diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,20834 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.0848626286199215, + "eval_steps": 500, + "global_step": 800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1955.0, + "completions/mean_length": 1253.625, + "completions/mean_terminated_length": 837.5238037109375, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.00010607828577490188, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.0, + "learning_rate": 8e-06, + "loss": -0.0, + "num_tokens": 97652.0, + "reward": 1.7952632904052734, + "reward_std": 1.2897486686706543, + "rewards/reward_fn/mean": 1.7952632904052734, + "rewards/reward_fn/std": 1.2897486686706543, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 163.46875, + "completions/mean_terminated_length": 163.46875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.00021215657154980376, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.296875, + "kl": 0.014410574920475483, + "learning_rate": 7.9996e-06, + "loss": 0.0006, + "num_tokens": 133123.0, + "reward": 2.7671689987182617, + "reward_std": 0.028316717594861984, + "rewards/reward_fn/mean": 2.7671689987182617, + "rewards/reward_fn/std": 0.02831670455634594, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 61.9375, + "completions/mean_terminated_length": 61.9375, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.0003182348573247056, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.498046875, + "kl": 0.039460036728996783, + "learning_rate": 7.9992e-06, + "loss": 0.0016, + "num_tokens": 171169.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 178.46875, + "completions/mean_terminated_length": 178.46875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.0004243131430996075, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.012384430738165975, + "learning_rate": 7.9988e-06, + "loss": 0.0005, + "num_tokens": 217520.0, + "reward": 2.901700258255005, + "reward_std": 0.015337609685957432, + "rewards/reward_fn/mean": 2.901700258255005, + "rewards/reward_fn/std": 0.015337574295699596, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 168.71875, + "completions/mean_terminated_length": 168.71875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.0005303914288745094, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1572265625, + "kl": 0.02899795339908451, + "learning_rate": 7.9984e-06, + "loss": 0.0012, + "num_tokens": 255751.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 148.59375, + "completions/mean_terminated_length": 148.59375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.0006364697146494112, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.109375, + "kl": 0.0448116734623909, + "learning_rate": 7.998e-06, + "loss": 0.0018, + "num_tokens": 290010.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 243.5, + "completions/mean_terminated_length": 243.5, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.0007425480004243131, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.046875, + "kl": 0.026772579178214073, + "learning_rate": 7.9976e-06, + "loss": 0.0011, + "num_tokens": 331850.0, + "reward": 3.281740188598633, + "reward_std": 0.5656248331069946, + "rewards/reward_fn/mean": 3.281740188598633, + "rewards/reward_fn/std": 0.5656247735023499, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 193.84375, + "completions/mean_terminated_length": 193.84375, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.000848626286199215, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10205078125, + "kl": 0.014506876847008243, + "learning_rate": 7.9972e-06, + "loss": 0.0006, + "num_tokens": 380037.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 112.0, + "completions/max_terminated_length": 112.0, + "completions/mean_length": 104.21875, + "completions/mean_terminated_length": 104.21875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.0009547045719741168, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2109375, + "kl": 0.048403532593511045, + "learning_rate": 7.9968e-06, + "loss": 0.0019, + "num_tokens": 415468.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 252.0, + "completions/mean_terminated_length": 252.0, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.0010607828577490189, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11181640625, + "kl": 0.02490026137093082, + "learning_rate": 7.9964e-06, + "loss": 0.001, + "num_tokens": 478956.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 102.0, + "completions/max_terminated_length": 102.0, + "completions/mean_length": 75.84375, + "completions/mean_terminated_length": 75.84375, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.0011668611435239206, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.26953125, + "kl": 0.07843554252758622, + "learning_rate": 7.996e-06, + "loss": 0.0031, + "num_tokens": 518119.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 195.5625, + "completions/mean_terminated_length": 195.5625, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.0012729394292988225, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1611328125, + "kl": 0.03463225986342877, + "learning_rate": 7.995599999999998e-06, + "loss": 0.0014, + "num_tokens": 556985.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 681.0, + "completions/max_terminated_length": 681.0, + "completions/mean_length": 457.15625, + "completions/mean_terminated_length": 457.15625, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "epoch": 0.0013790177150737244, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.02525286824675277, + "learning_rate": 7.9952e-06, + "loss": 0.001, + "num_tokens": 619038.0, + "reward": 3.6924948692321777, + "reward_std": 0.541310727596283, + "rewards/reward_fn/mean": 3.6924948692321777, + "rewards/reward_fn/std": 0.541310727596283, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1231.0, + "completions/max_terminated_length": 1231.0, + "completions/mean_length": 625.96875, + "completions/mean_terminated_length": 625.96875, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.0014850960008486263, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.020447831630008295, + "learning_rate": 7.9948e-06, + "loss": 0.0008, + "num_tokens": 679165.0, + "reward": 2.675490617752075, + "reward_std": 0.3050808012485504, + "rewards/reward_fn/mean": 2.675490617752075, + "rewards/reward_fn/std": 0.3050808012485504, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 187.84375, + "completions/mean_terminated_length": 187.84375, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.0015911742866235282, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1484375, + "kl": 0.04133811534848064, + "learning_rate": 7.9944e-06, + "loss": 0.0017, + "num_tokens": 724184.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 193.4375, + "completions/mean_terminated_length": 193.4375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.00169725257239843, + "frac_reward_zero_std": 0.0, + "grad_norm": 90.5, + "kl": 0.042876197723671794, + "learning_rate": 7.994e-06, + "loss": 0.0017, + "num_tokens": 761606.0, + "reward": 2.927339792251587, + "reward_std": 0.06646312773227692, + "rewards/reward_fn/mean": 2.927339792251587, + "rewards/reward_fn/std": 0.06646312773227692, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 171.4375, + "completions/mean_terminated_length": 171.4375, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.001803330858173332, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "kl": 0.026966359262587503, + "learning_rate": 7.9936e-06, + "loss": 0.0011, + "num_tokens": 805396.0, + "reward": 2.997476577758789, + "reward_std": 0.1855943202972412, + "rewards/reward_fn/mean": 2.997476577758789, + "rewards/reward_fn/std": 0.1855943202972412, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 118.40625, + "completions/mean_terminated_length": 118.40625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.0019094091439482337, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.59375, + "kl": 0.029081921704346314, + "learning_rate": 7.9932e-06, + "loss": 0.0012, + "num_tokens": 829697.0, + "reward": 3.9648869037628174, + "reward_std": 0.19862982630729675, + "rewards/reward_fn/mean": 3.9648869037628174, + "rewards/reward_fn/std": 0.19862982630729675, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 300.71875, + "completions/mean_terminated_length": 300.71875, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.002015487429723136, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.022967668395722285, + "learning_rate": 7.992799999999999e-06, + "loss": 0.0009, + "num_tokens": 875000.0, + "reward": 3.8433151245117188, + "reward_std": 0.42122551798820496, + "rewards/reward_fn/mean": 3.8433151245117188, + "rewards/reward_fn/std": 0.42122551798820496, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 129.53125, + "completions/mean_terminated_length": 129.53125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.0021215657154980377, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12890625, + "kl": 0.027954923105426133, + "learning_rate": 7.9924e-06, + "loss": 0.0011, + "num_tokens": 902569.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 157.0, + "completions/max_terminated_length": 157.0, + "completions/mean_length": 145.59375, + "completions/mean_terminated_length": 145.59375, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.0022276440012729396, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10595703125, + "kl": 0.021872296027140692, + "learning_rate": 7.991999999999999e-06, + "loss": 0.0009, + "num_tokens": 922172.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 584.0, + "completions/max_terminated_length": 584.0, + "completions/mean_length": 359.15625, + "completions/mean_terminated_length": 359.15625, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.002333722287047841, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.01314439470297657, + "learning_rate": 7.9916e-06, + "loss": 0.0005, + "num_tokens": 986177.0, + "reward": 2.712355613708496, + "reward_std": 0.04796939715743065, + "rewards/reward_fn/mean": 2.712355613708496, + "rewards/reward_fn/std": 0.04796938970685005, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 394.5625, + "completions/mean_terminated_length": 394.5625, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.002439800572822743, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.022076091496273875, + "learning_rate": 7.991199999999999e-06, + "loss": 0.0009, + "num_tokens": 1037555.0, + "reward": 3.5815935134887695, + "reward_std": 0.5874396562576294, + "rewards/reward_fn/mean": 3.5815935134887695, + "rewards/reward_fn/std": 0.5874396562576294, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 312.3125, + "completions/mean_terminated_length": 312.3125, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.002545878858597645, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.015625, + "kl": 0.022688235039822757, + "learning_rate": 7.9908e-06, + "loss": 0.0009, + "num_tokens": 1094173.0, + "reward": 2.881844997406006, + "reward_std": 0.045588310807943344, + "rewards/reward_fn/mean": 2.881844997406006, + "rewards/reward_fn/std": 0.04558834806084633, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 229.375, + "completions/mean_terminated_length": 229.375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.002651957144372547, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0810546875, + "kl": 0.012471042369725183, + "learning_rate": 7.9904e-06, + "loss": 0.0005, + "num_tokens": 1146825.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 197.0, + "completions/mean_terminated_length": 197.0, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.0027580354301474487, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.03129113302566111, + "learning_rate": 7.99e-06, + "loss": 0.0013, + "num_tokens": 1187497.0, + "reward": 2.8278141021728516, + "reward_std": 0.21510717272758484, + "rewards/reward_fn/mean": 2.8278141021728516, + "rewards/reward_fn/std": 0.21510712802410126, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 143.0, + "completions/mean_terminated_length": 143.0, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.0028641137159223506, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1279296875, + "kl": 0.037364296382293105, + "learning_rate": 7.9896e-06, + "loss": 0.0015, + "num_tokens": 1220649.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 204.0, + "completions/max_terminated_length": 204.0, + "completions/mean_length": 160.40625, + "completions/mean_terminated_length": 160.40625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.0029701920016972526, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.71875, + "kl": 0.04326166835380718, + "learning_rate": 7.9892e-06, + "loss": 0.0017, + "num_tokens": 1274870.0, + "reward": 3.8869001865386963, + "reward_std": 0.35727038979530334, + "rewards/reward_fn/mean": 3.8869001865386963, + "rewards/reward_fn/std": 0.35727038979530334, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 615.0, + "completions/max_terminated_length": 615.0, + "completions/mean_length": 368.28125, + "completions/mean_terminated_length": 368.28125, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.0030762702874721545, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1875, + "kl": 0.02703940650098957, + "learning_rate": 7.9888e-06, + "loss": 0.0011, + "num_tokens": 1327839.0, + "reward": 2.6013264656066895, + "reward_std": 0.3262932300567627, + "rewards/reward_fn/mean": 2.6013264656066895, + "rewards/reward_fn/std": 0.3262932002544403, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 569.0, + "completions/max_terminated_length": 569.0, + "completions/mean_length": 377.5, + "completions/mean_terminated_length": 377.5, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.0031823485732470564, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.021802546572871506, + "learning_rate": 7.9884e-06, + "loss": 0.0009, + "num_tokens": 1356015.0, + "reward": 2.6662087440490723, + "reward_std": 0.4428122639656067, + "rewards/reward_fn/mean": 2.6662087440490723, + "rewards/reward_fn/std": 0.4428122341632843, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 198.46875, + "completions/mean_terminated_length": 198.46875, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.0032884268590219583, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0986328125, + "kl": 0.02761831966927275, + "learning_rate": 7.988e-06, + "loss": 0.0011, + "num_tokens": 1397790.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 202.78125, + "completions/mean_terminated_length": 202.78125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.00339450514479686, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.040427666739560664, + "learning_rate": 7.9876e-06, + "loss": 0.0016, + "num_tokens": 1441143.0, + "reward": 2.7268662452697754, + "reward_std": 0.034821733832359314, + "rewards/reward_fn/mean": 2.7268662452697754, + "rewards/reward_fn/std": 0.034821704030036926, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 204.6875, + "completions/mean_terminated_length": 204.6875, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.003500583430571762, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0703125, + "kl": 0.018971540295751765, + "learning_rate": 7.987199999999999e-06, + "loss": 0.0008, + "num_tokens": 1482093.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 731.0, + "completions/max_terminated_length": 731.0, + "completions/mean_length": 302.71875, + "completions/mean_terminated_length": 302.71875, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.003606661716346664, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.030103662895271555, + "learning_rate": 7.9868e-06, + "loss": 0.0012, + "num_tokens": 1529924.0, + "reward": 3.224987268447876, + "reward_std": 0.5399196147918701, + "rewards/reward_fn/mean": 3.224987268447876, + "rewards/reward_fn/std": 0.5399196147918701, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 281.1875, + "completions/mean_terminated_length": 281.1875, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.003712740002121566, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.03495570027735084, + "learning_rate": 7.986399999999999e-06, + "loss": 0.0014, + "num_tokens": 1571626.0, + "reward": 2.5815320014953613, + "reward_std": 0.1874743103981018, + "rewards/reward_fn/mean": 2.5815320014953613, + "rewards/reward_fn/std": 0.187474325299263, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 231.21875, + "completions/mean_terminated_length": 231.21875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.0038188182878964674, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.04162114462815225, + "learning_rate": 7.986e-06, + "loss": 0.0017, + "num_tokens": 1614289.0, + "reward": 2.7709145545959473, + "reward_std": 0.020474720746278763, + "rewards/reward_fn/mean": 2.7709145545959473, + "rewards/reward_fn/std": 0.02047473005950451, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 184.75, + "completions/mean_terminated_length": 184.75, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.00392489657367137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1796875, + "kl": 0.048726535169407725, + "learning_rate": 7.9856e-06, + "loss": 0.0019, + "num_tokens": 1676041.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 194.75, + "completions/mean_terminated_length": 194.75, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.004030974859446272, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08447265625, + "kl": 0.015456413209903985, + "learning_rate": 7.9852e-06, + "loss": 0.0006, + "num_tokens": 1725761.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.0, + "completions/max_terminated_length": 674.0, + "completions/mean_length": 331.75, + "completions/mean_terminated_length": 331.75, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.0041370531452211735, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.03350905637489632, + "learning_rate": 7.9848e-06, + "loss": 0.0013, + "num_tokens": 1775257.0, + "reward": 2.9837241172790527, + "reward_std": 0.28284090757369995, + "rewards/reward_fn/mean": 2.9837241172790527, + "rewards/reward_fn/std": 0.28284087777137756, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 178.8125, + "completions/mean_terminated_length": 178.8125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.0042431314309960754, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09228515625, + "kl": 0.026478524028789252, + "learning_rate": 7.9844e-06, + "loss": 0.0011, + "num_tokens": 1817203.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 196.46875, + "completions/mean_terminated_length": 196.46875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.004349209716770977, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.123046875, + "kl": 0.030847460555378348, + "learning_rate": 7.984e-06, + "loss": 0.0012, + "num_tokens": 1857922.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1439.0, + "completions/max_terminated_length": 1439.0, + "completions/mean_length": 671.46875, + "completions/mean_terminated_length": 671.46875, + "completions/min_length": 449.0, + "completions/min_terminated_length": 449.0, + "epoch": 0.004455288002545879, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.91015625, + "kl": 0.012836619760491885, + "learning_rate": 7.9836e-06, + "loss": 0.0005, + "num_tokens": 1915249.0, + "reward": 1.707290530204773, + "reward_std": 0.014385012909770012, + "rewards/reward_fn/mean": 1.707290530204773, + "rewards/reward_fn/std": 0.01438502874225378, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 239.125, + "completions/mean_terminated_length": 239.125, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.00456136628832078, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07958984375, + "kl": 0.018241140787722543, + "learning_rate": 7.9832e-06, + "loss": 0.0007, + "num_tokens": 1965141.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 221.0, + "completions/max_terminated_length": 221.0, + "completions/mean_length": 195.03125, + "completions/mean_terminated_length": 195.03125, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.004667444574095682, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.027864656323799863, + "learning_rate": 7.9828e-06, + "loss": 0.0011, + "num_tokens": 2010294.0, + "reward": 2.832035541534424, + "reward_std": 0.018766071647405624, + "rewards/reward_fn/mean": 2.832035541534424, + "rewards/reward_fn/std": 0.018766086548566818, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 154.0, + "completions/max_terminated_length": 154.0, + "completions/mean_length": 114.28125, + "completions/mean_terminated_length": 114.28125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.004773522859870584, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.177734375, + "kl": 0.03448771417606622, + "learning_rate": 7.9824e-06, + "loss": 0.0014, + "num_tokens": 2032671.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 82.0, + "completions/max_terminated_length": 82.0, + "completions/mean_length": 73.875, + "completions/mean_terminated_length": 73.875, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.004879601145645486, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0859375, + "kl": 0.02053578832419589, + "learning_rate": 7.981999999999999e-06, + "loss": 0.0008, + "num_tokens": 2077403.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.0, + "completions/max_terminated_length": 638.0, + "completions/mean_length": 381.21875, + "completions/mean_terminated_length": 381.21875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.004985679431420388, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.028637878014706075, + "learning_rate": 7.9816e-06, + "loss": 0.0011, + "num_tokens": 2126082.0, + "reward": 2.7234487533569336, + "reward_std": 0.29123303294181824, + "rewards/reward_fn/mean": 2.7234487533569336, + "rewards/reward_fn/std": 0.29123303294181824, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 305.09375, + "completions/mean_terminated_length": 305.09375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.00509175771719529, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0, + "kl": 0.018045106873614714, + "learning_rate": 7.9812e-06, + "loss": 0.0007, + "num_tokens": 2184005.0, + "reward": 3.9637742042541504, + "reward_std": 0.20492403209209442, + "rewards/reward_fn/mean": 3.9637742042541504, + "rewards/reward_fn/std": 0.20492400228977203, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 950.0, + "completions/max_terminated_length": 950.0, + "completions/mean_length": 523.65625, + "completions/mean_terminated_length": 523.65625, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "epoch": 0.005197836002970192, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.0190808747720439, + "learning_rate": 7.9808e-06, + "loss": 0.0008, + "num_tokens": 2236858.0, + "reward": 2.6711881160736084, + "reward_std": 0.7798165082931519, + "rewards/reward_fn/mean": 2.6711881160736084, + "rewards/reward_fn/std": 0.7798165678977966, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 197.03125, + "completions/mean_terminated_length": 197.03125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.005303914288745094, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11669921875, + "kl": 0.030177024600561708, + "learning_rate": 7.9804e-06, + "loss": 0.0012, + "num_tokens": 2276123.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 302.34375, + "completions/mean_terminated_length": 302.34375, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.0054099925745199956, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.025053055898752064, + "learning_rate": 7.98e-06, + "loss": 0.001, + "num_tokens": 2327142.0, + "reward": 2.815532922744751, + "reward_std": 0.061222709715366364, + "rewards/reward_fn/mean": 2.815532922744751, + "rewards/reward_fn/std": 0.06122272461652756, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 207.6875, + "completions/mean_terminated_length": 207.6875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.0055160708602948975, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.609375, + "kl": 0.043745150382164866, + "learning_rate": 7.979599999999999e-06, + "loss": 0.0017, + "num_tokens": 2362652.0, + "reward": 3.8098015785217285, + "reward_std": 0.449843168258667, + "rewards/reward_fn/mean": 3.8098015785217285, + "rewards/reward_fn/std": 0.449843168258667, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 177.59375, + "completions/mean_terminated_length": 177.59375, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.005622149146069799, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10009765625, + "kl": 0.030419682036153972, + "learning_rate": 7.9792e-06, + "loss": 0.0012, + "num_tokens": 2413711.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 606.0, + "completions/max_terminated_length": 606.0, + "completions/mean_length": 379.125, + "completions/mean_terminated_length": 379.125, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.005728227431844701, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.03096459503285587, + "learning_rate": 7.978799999999999e-06, + "loss": 0.0012, + "num_tokens": 2461139.0, + "reward": 2.792288303375244, + "reward_std": 0.22959093749523163, + "rewards/reward_fn/mean": 2.792288303375244, + "rewards/reward_fn/std": 0.22959090769290924, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1568.0, + "completions/mean_length": 899.3125, + "completions/mean_terminated_length": 862.258056640625, + "completions/min_length": 576.0, + "completions/min_terminated_length": 576.0, + "epoch": 0.005834305717619603, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8046875, + "kl": 0.013051759175141342, + "learning_rate": 7.9784e-06, + "loss": 0.0005, + "num_tokens": 2527389.0, + "reward": 2.574242115020752, + "reward_std": 0.390576034784317, + "rewards/reward_fn/mean": 2.574242115020752, + "rewards/reward_fn/std": 0.39057594537734985, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 341.96875, + "completions/mean_terminated_length": 341.96875, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.005940384003394505, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08740234375, + "kl": 0.01978624120238237, + "learning_rate": 7.977999999999999e-06, + "loss": 0.0008, + "num_tokens": 2569180.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 183.34375, + "completions/mean_terminated_length": 183.34375, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.006046462289169407, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.123046875, + "kl": 0.03518621769035235, + "learning_rate": 7.9776e-06, + "loss": 0.0014, + "num_tokens": 2628679.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 178.15625, + "completions/mean_terminated_length": 178.15625, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.006152540574944309, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.21875, + "kl": 0.010592784703476354, + "learning_rate": 7.977199999999999e-06, + "loss": 0.0004, + "num_tokens": 2680684.0, + "reward": 3.1400744915008545, + "reward_std": 0.01094813086092472, + "rewards/reward_fn/mean": 3.1400744915008545, + "rewards/reward_fn/std": 0.010948143899440765, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 170.875, + "completions/mean_terminated_length": 170.875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.006258618860719211, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.0406050537712872, + "learning_rate": 7.9768e-06, + "loss": 0.0016, + "num_tokens": 2721064.0, + "reward": 2.754612922668457, + "reward_std": 0.024560745805501938, + "rewards/reward_fn/mean": 2.754612922668457, + "rewards/reward_fn/std": 0.02456069365143776, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 260.6875, + "completions/mean_terminated_length": 260.6875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.006364697146494113, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.03231381287332624, + "learning_rate": 7.9764e-06, + "loss": 0.0013, + "num_tokens": 2783166.0, + "reward": 2.8348751068115234, + "reward_std": 0.021478639915585518, + "rewards/reward_fn/mean": 2.8348751068115234, + "rewards/reward_fn/std": 0.0214786846190691, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 177.40625, + "completions/mean_terminated_length": 177.40625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.006470775432269015, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1494140625, + "kl": 0.03543406492099166, + "learning_rate": 7.976e-06, + "loss": 0.0014, + "num_tokens": 2835147.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 298.71875, + "completions/mean_terminated_length": 298.71875, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.0065768537180439166, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.02661313521093689, + "learning_rate": 7.9756e-06, + "loss": 0.0011, + "num_tokens": 2874050.0, + "reward": 2.8492093086242676, + "reward_std": 0.18044866621494293, + "rewards/reward_fn/mean": 2.8492093086242676, + "rewards/reward_fn/std": 0.18044860661029816, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.0066829320038188185, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.02399728394811973, + "learning_rate": 7.9752e-06, + "loss": 0.001, + "num_tokens": 2914209.0, + "reward": 3.966371774673462, + "reward_std": 0.19022996723651886, + "rewards/reward_fn/mean": 3.966371774673462, + "rewards/reward_fn/std": 0.19022996723651886, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 160.90625, + "completions/mean_terminated_length": 160.90625, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.00678901028959372, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10791015625, + "kl": 0.021722051285905764, + "learning_rate": 7.9748e-06, + "loss": 0.0009, + "num_tokens": 2953118.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 327.25, + "completions/mean_terminated_length": 327.25, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.006895088575368622, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.026302733516786247, + "learning_rate": 7.9744e-06, + "loss": 0.0011, + "num_tokens": 3001318.0, + "reward": 3.931765079498291, + "reward_std": 0.26856717467308044, + "rewards/reward_fn/mean": 3.931765079498291, + "rewards/reward_fn/std": 0.26856720447540283, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 710.0, + "completions/max_terminated_length": 710.0, + "completions/mean_length": 387.625, + "completions/mean_terminated_length": 387.625, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.007001166861143524, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.026840948354220018, + "learning_rate": 7.974e-06, + "loss": 0.0011, + "num_tokens": 3080314.0, + "reward": 3.5707976818084717, + "reward_std": 0.9079517722129822, + "rewards/reward_fn/mean": 3.5707976818084717, + "rewards/reward_fn/std": 0.9079517126083374, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 292.9375, + "completions/mean_terminated_length": 292.9375, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.007107245146918426, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.026440918241860345, + "learning_rate": 7.9736e-06, + "loss": 0.0011, + "num_tokens": 3126872.0, + "reward": 3.963066339492798, + "reward_std": 0.2089284509420395, + "rewards/reward_fn/mean": 3.963066339492798, + "rewards/reward_fn/std": 0.20892846584320068, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 238.3125, + "completions/mean_terminated_length": 238.3125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.007213323432693328, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.04070335888536647, + "learning_rate": 7.9732e-06, + "loss": 0.0016, + "num_tokens": 3171234.0, + "reward": 2.782970905303955, + "reward_std": 0.2698141932487488, + "rewards/reward_fn/mean": 2.782970905303955, + "rewards/reward_fn/std": 0.2698141932487488, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 145.9375, + "completions/mean_terminated_length": 145.9375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.00731940171846823, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10595703125, + "kl": 0.039589619380421937, + "learning_rate": 7.9728e-06, + "loss": 0.0016, + "num_tokens": 3219968.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 159.4375, + "completions/mean_terminated_length": 159.4375, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.007425480004243132, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.130859375, + "kl": 0.044402167259249836, + "learning_rate": 7.9724e-06, + "loss": 0.0018, + "num_tokens": 3261806.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 682.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 536.28125, + "completions/mean_terminated_length": 536.28125, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "epoch": 0.007531558290018034, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0537109375, + "kl": 0.017754276690538973, + "learning_rate": 7.972e-06, + "loss": 0.0007, + "num_tokens": 3326647.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 652.0, + "completions/max_terminated_length": 652.0, + "completions/mean_length": 395.625, + "completions/mean_terminated_length": 395.625, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.007637636575792935, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.02606621669838205, + "learning_rate": 7.9716e-06, + "loss": 0.001, + "num_tokens": 3371147.0, + "reward": 2.7468667030334473, + "reward_std": 0.029990501701831818, + "rewards/reward_fn/mean": 2.7468667030334473, + "rewards/reward_fn/std": 0.029990488663315773, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 155.0, + "completions/max_terminated_length": 155.0, + "completions/mean_length": 129.21875, + "completions/mean_terminated_length": 129.21875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.007743714861567837, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16796875, + "kl": 0.035898417234420776, + "learning_rate": 7.9712e-06, + "loss": 0.0014, + "num_tokens": 3416178.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 210.5, + "completions/mean_terminated_length": 210.5, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.00784979314734274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09521484375, + "kl": 0.0296817320631817, + "learning_rate": 7.9708e-06, + "loss": 0.0012, + "num_tokens": 3470530.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 286.75, + "completions/mean_terminated_length": 286.75, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.00795587143311764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06689453125, + "kl": 0.02990246523404494, + "learning_rate": 7.970399999999999e-06, + "loss": 0.0012, + "num_tokens": 3535866.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 66.375, + "completions/mean_terminated_length": 66.375, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.008061949718892543, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3203125, + "kl": 0.04533489595633, + "learning_rate": 7.97e-06, + "loss": 0.0018, + "num_tokens": 3588262.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 656.0, + "completions/max_terminated_length": 656.0, + "completions/mean_length": 330.9375, + "completions/mean_terminated_length": 330.9375, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.008168028004667444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0693359375, + "kl": 0.023781975061865523, + "learning_rate": 7.969599999999999e-06, + "loss": 0.001, + "num_tokens": 3637028.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 113.0, + "completions/max_terminated_length": 113.0, + "completions/mean_length": 84.1875, + "completions/mean_terminated_length": 84.1875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.008274106290442347, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1650390625, + "kl": 0.04825737385544926, + "learning_rate": 7.9692e-06, + "loss": 0.0019, + "num_tokens": 3682186.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 352.6875, + "completions/mean_terminated_length": 352.6875, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.008380184576217248, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061767578125, + "kl": 0.01972204475896433, + "learning_rate": 7.968799999999999e-06, + "loss": 0.0008, + "num_tokens": 3765056.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 293.71875, + "completions/mean_terminated_length": 293.71875, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.008486262861992151, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.01990896329516545, + "learning_rate": 7.9684e-06, + "loss": 0.0008, + "num_tokens": 3821559.0, + "reward": 2.8117334842681885, + "reward_std": 0.027212122455239296, + "rewards/reward_fn/mean": 2.8117334842681885, + "rewards/reward_fn/std": 0.027212098240852356, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 158.65625, + "completions/mean_terminated_length": 158.65625, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.008592341147767052, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09912109375, + "kl": 0.03137456753756851, + "learning_rate": 7.967999999999999e-06, + "loss": 0.0013, + "num_tokens": 3855596.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 185.59375, + "completions/mean_terminated_length": 185.59375, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.008698419433541955, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.134765625, + "kl": 0.04182742937700823, + "learning_rate": 7.9676e-06, + "loss": 0.0017, + "num_tokens": 3903775.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 245.65625, + "completions/mean_terminated_length": 245.65625, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.008804497719316856, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09716796875, + "kl": 0.02975867863278836, + "learning_rate": 7.967199999999999e-06, + "loss": 0.0012, + "num_tokens": 3946388.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 163.53125, + "completions/mean_terminated_length": 163.53125, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.008910576005091759, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1494140625, + "kl": 0.0318527152412571, + "learning_rate": 7.9668e-06, + "loss": 0.0013, + "num_tokens": 3979045.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 284.46875, + "completions/mean_terminated_length": 284.46875, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.00901665429086666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07373046875, + "kl": 0.025949590606614947, + "learning_rate": 7.9664e-06, + "loss": 0.001, + "num_tokens": 4022452.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 196.0, + "completions/max_terminated_length": 196.0, + "completions/mean_length": 154.96875, + "completions/mean_terminated_length": 154.96875, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.00912273257664156, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1513671875, + "kl": 0.044943351356778294, + "learning_rate": 7.966e-06, + "loss": 0.0018, + "num_tokens": 4055379.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 173.25, + "completions/mean_terminated_length": 173.25, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.009228810862416463, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11083984375, + "kl": 0.03375592204974964, + "learning_rate": 7.9656e-06, + "loss": 0.0014, + "num_tokens": 4095131.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 232.5, + "completions/mean_terminated_length": 232.5, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.009334889148191364, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.040917065110988915, + "learning_rate": 7.9652e-06, + "loss": 0.0016, + "num_tokens": 4133579.0, + "reward": 3.385199785232544, + "reward_std": 0.6248396635055542, + "rewards/reward_fn/mean": 3.385199785232544, + "rewards/reward_fn/std": 0.6248396635055542, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 195.125, + "completions/mean_terminated_length": 195.125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.009440967433966267, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08837890625, + "kl": 0.03094743611291051, + "learning_rate": 7.9648e-06, + "loss": 0.0012, + "num_tokens": 4168399.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 254.75, + "completions/mean_terminated_length": 254.75, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.009547045719741168, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.020157222083071247, + "learning_rate": 7.9644e-06, + "loss": 0.0008, + "num_tokens": 4212263.0, + "reward": 2.7611052989959717, + "reward_std": 0.04510229453444481, + "rewards/reward_fn/mean": 2.7611052989959717, + "rewards/reward_fn/std": 0.0451023168861866, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 228.71875, + "completions/mean_terminated_length": 228.71875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.009653124005516071, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.025626638293033466, + "learning_rate": 7.964e-06, + "loss": 0.001, + "num_tokens": 4264894.0, + "reward": 3.182257652282715, + "reward_std": 0.5200350284576416, + "rewards/reward_fn/mean": 3.182257652282715, + "rewards/reward_fn/std": 0.5200349688529968, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.0, + "completions/max_terminated_length": 601.0, + "completions/mean_length": 388.15625, + "completions/mean_terminated_length": 388.15625, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.009759202291290972, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.02052137372083962, + "learning_rate": 7.9636e-06, + "loss": 0.0008, + "num_tokens": 4326723.0, + "reward": 3.77706241607666, + "reward_std": 0.5531520247459412, + "rewards/reward_fn/mean": 3.77706241607666, + "rewards/reward_fn/std": 0.5531519651412964, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 149.53125, + "completions/mean_terminated_length": 149.53125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.009865280577065875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1279296875, + "kl": 0.025205849786289036, + "learning_rate": 7.963199999999999e-06, + "loss": 0.001, + "num_tokens": 4361044.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 967.4375, + "completions/mean_terminated_length": 813.0714721679688, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.009971358862840776, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0703125, + "kl": 0.01375104625185486, + "learning_rate": 7.9628e-06, + "loss": 0.0006, + "num_tokens": 4422434.0, + "reward": 2.3512930870056152, + "reward_std": 0.9757513999938965, + "rewards/reward_fn/mean": 2.3512930870056152, + "rewards/reward_fn/std": 0.9757513999938965, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 264.34375, + "completions/mean_terminated_length": 264.34375, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.010077437148615679, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.087890625, + "kl": 0.03294429712696001, + "learning_rate": 7.962399999999999e-06, + "loss": 0.0013, + "num_tokens": 4460301.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1975.0, + "completions/max_terminated_length": 1975.0, + "completions/mean_length": 1142.59375, + "completions/mean_terminated_length": 1142.59375, + "completions/min_length": 641.0, + "completions/min_terminated_length": 641.0, + "epoch": 0.01018351543439058, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.484375, + "kl": 0.011026079504517838, + "learning_rate": 7.962e-06, + "loss": 0.0004, + "num_tokens": 4529856.0, + "reward": 2.711195468902588, + "reward_std": 0.18870149552822113, + "rewards/reward_fn/mean": 2.711195468902588, + "rewards/reward_fn/std": 0.18870148062705994, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 215.78125, + "completions/mean_terminated_length": 215.78125, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.010289593720165482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.080078125, + "kl": 0.02338853490073234, + "learning_rate": 7.9616e-06, + "loss": 0.0009, + "num_tokens": 4571801.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 706.0, + "completions/max_terminated_length": 706.0, + "completions/mean_length": 508.03125, + "completions/mean_terminated_length": 508.03125, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "epoch": 0.010395672005940384, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078125, + "kl": 0.012433522861101665, + "learning_rate": 7.9612e-06, + "loss": 0.0005, + "num_tokens": 4632890.0, + "reward": 3.551553726196289, + "reward_std": 0.6977203488349915, + "rewards/reward_fn/mean": 3.551553726196289, + "rewards/reward_fn/std": 0.6977203488349915, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 881.0, + "completions/max_terminated_length": 881.0, + "completions/mean_length": 503.21875, + "completions/mean_terminated_length": 503.21875, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.010501750291715286, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.015625, + "kl": 0.01728816461400129, + "learning_rate": 7.9608e-06, + "loss": 0.0007, + "num_tokens": 4686113.0, + "reward": 2.795738458633423, + "reward_std": 0.05973564833402634, + "rewards/reward_fn/mean": 2.795738458633423, + "rewards/reward_fn/std": 0.05973568186163902, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 179.78125, + "completions/mean_terminated_length": 179.78125, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.010607828577490187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.134765625, + "kl": 0.037204470427241176, + "learning_rate": 7.9604e-06, + "loss": 0.0015, + "num_tokens": 4720314.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 248.46875, + "completions/mean_terminated_length": 248.46875, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.01071390686326509, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.107421875, + "kl": 0.027893651509657502, + "learning_rate": 7.96e-06, + "loss": 0.0011, + "num_tokens": 4770729.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 218.3125, + "completions/mean_terminated_length": 218.3125, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.010819985149039991, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09228515625, + "kl": 0.027056844613980502, + "learning_rate": 7.959599999999999e-06, + "loss": 0.0011, + "num_tokens": 4809267.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 93.0, + "completions/max_terminated_length": 93.0, + "completions/mean_length": 85.6875, + "completions/mean_terminated_length": 85.6875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.010926063434814894, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1962890625, + "kl": 0.015027127868961543, + "learning_rate": 7.9592e-06, + "loss": 0.0006, + "num_tokens": 4831881.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 189.625, + "completions/mean_terminated_length": 189.625, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.011032141720589795, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.02954344844329171, + "learning_rate": 7.958799999999999e-06, + "loss": 0.0012, + "num_tokens": 4893149.0, + "reward": 2.8656177520751953, + "reward_std": 0.012954896315932274, + "rewards/reward_fn/mean": 2.8656177520751953, + "rewards/reward_fn/std": 0.012954906560480595, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 143.34375, + "completions/mean_terminated_length": 143.34375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.011138220006364698, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.04420140647562221, + "learning_rate": 7.9584e-06, + "loss": 0.0018, + "num_tokens": 4914216.0, + "reward": 3.203092575073242, + "reward_std": 0.031201422214508057, + "rewards/reward_fn/mean": 3.203092575073242, + "rewards/reward_fn/std": 0.031201381236314774, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 191.0625, + "completions/mean_terminated_length": 191.0625, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.011244298292139599, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.024419172026682645, + "learning_rate": 7.957999999999999e-06, + "loss": 0.001, + "num_tokens": 4955690.0, + "reward": 2.354905843734741, + "reward_std": 0.586423397064209, + "rewards/reward_fn/mean": 2.354905843734741, + "rewards/reward_fn/std": 0.586423397064209, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 202.71875, + "completions/mean_terminated_length": 202.71875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.011350376577914502, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.328125, + "kl": 0.02303978052805178, + "learning_rate": 7.9576e-06, + "loss": 0.0009, + "num_tokens": 4998529.0, + "reward": 3.2449896335601807, + "reward_std": 0.5573272705078125, + "rewards/reward_fn/mean": 3.2449896335601807, + "rewards/reward_fn/std": 0.5573272109031677, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 62.75, + "completions/mean_terminated_length": 62.75, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.011456454863689403, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2021484375, + "kl": 0.02382526727160439, + "learning_rate": 7.9572e-06, + "loss": 0.001, + "num_tokens": 5035353.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1877.0, + "completions/max_terminated_length": 1877.0, + "completions/mean_length": 1008.65625, + "completions/mean_terminated_length": 1008.65625, + "completions/min_length": 489.0, + "completions/min_terminated_length": 489.0, + "epoch": 0.011562533149464305, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.921875, + "kl": 0.011780590401031077, + "learning_rate": 7.9568e-06, + "loss": 0.0005, + "num_tokens": 5094030.0, + "reward": 2.959881067276001, + "reward_std": 0.9156700968742371, + "rewards/reward_fn/mean": 2.959881067276001, + "rewards/reward_fn/std": 0.9156700968742371, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 280.09375, + "completions/mean_terminated_length": 280.09375, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.011668611435239206, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08349609375, + "kl": 0.021801961964229122, + "learning_rate": 7.9564e-06, + "loss": 0.0009, + "num_tokens": 5138897.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 207.96875, + "completions/mean_terminated_length": 207.96875, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.01177468972101411, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "kl": 0.03097967169014737, + "learning_rate": 7.956e-06, + "loss": 0.0012, + "num_tokens": 5176816.0, + "reward": 3.800929069519043, + "reward_std": 0.5765722393989563, + "rewards/reward_fn/mean": 3.800929069519043, + "rewards/reward_fn/std": 0.5765722393989563, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 205.53125, + "completions/mean_terminated_length": 205.53125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.01188076800678901, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.043233627162408084, + "learning_rate": 7.955599999999999e-06, + "loss": 0.0017, + "num_tokens": 5227073.0, + "reward": 2.8226916790008545, + "reward_std": 0.274354487657547, + "rewards/reward_fn/mean": 2.8226916790008545, + "rewards/reward_fn/std": 0.2743545174598694, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 286.84375, + "completions/mean_terminated_length": 286.84375, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.011986846292563913, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10205078125, + "kl": 0.022725433052983135, + "learning_rate": 7.9552e-06, + "loss": 0.0009, + "num_tokens": 5274268.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 123.0, + "completions/max_terminated_length": 123.0, + "completions/mean_length": 77.78125, + "completions/mean_terminated_length": 77.78125, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.012092924578338814, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.09375, + "kl": 0.03250217955792323, + "learning_rate": 7.954799999999999e-06, + "loss": 0.0013, + "num_tokens": 5313717.0, + "reward": 3.0405046939849854, + "reward_std": 0.030798695981502533, + "rewards/reward_fn/mean": 3.0405046939849854, + "rewards/reward_fn/std": 0.030798697844147682, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 346.875, + "completions/mean_terminated_length": 346.875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.012199002864113715, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.019344605010701343, + "learning_rate": 7.9544e-06, + "loss": 0.0008, + "num_tokens": 5368593.0, + "reward": 3.050971031188965, + "reward_std": 0.5881122350692749, + "rewards/reward_fn/mean": 3.050971031188965, + "rewards/reward_fn/std": 0.5881122350692749, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 730.0, + "completions/max_terminated_length": 730.0, + "completions/mean_length": 416.59375, + "completions/mean_terminated_length": 416.59375, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.012305081149888618, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.02247777715092525, + "learning_rate": 7.953999999999999e-06, + "loss": 0.0009, + "num_tokens": 5419044.0, + "reward": 2.8257861137390137, + "reward_std": 0.029950875788927078, + "rewards/reward_fn/mean": 2.8257861137390137, + "rewards/reward_fn/std": 0.029950888827443123, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 392.9375, + "completions/mean_terminated_length": 392.9375, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.012411159435663519, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.02738574449904263, + "learning_rate": 7.9536e-06, + "loss": 0.0011, + "num_tokens": 5474146.0, + "reward": 3.4753129482269287, + "reward_std": 0.6120408773422241, + "rewards/reward_fn/mean": 3.4753129482269287, + "rewards/reward_fn/std": 0.6120408773422241, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 239.21875, + "completions/mean_terminated_length": 239.21875, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.012517237721438422, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.02117563263163902, + "learning_rate": 7.953199999999999e-06, + "loss": 0.0008, + "num_tokens": 5515369.0, + "reward": 2.897916316986084, + "reward_std": 0.20252078771591187, + "rewards/reward_fn/mean": 2.897916316986084, + "rewards/reward_fn/std": 0.20252081751823425, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1576.0, + "completions/max_terminated_length": 1576.0, + "completions/mean_length": 434.59375, + "completions/mean_terminated_length": 434.59375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.012623316007213323, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.029518495866796002, + "learning_rate": 7.9528e-06, + "loss": 0.0012, + "num_tokens": 5561532.0, + "reward": 2.9283552169799805, + "reward_std": 0.04649563878774643, + "rewards/reward_fn/mean": 2.9283552169799805, + "rewards/reward_fn/std": 0.046495646238327026, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 170.46875, + "completions/mean_terminated_length": 170.46875, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.012729394292988225, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0869140625, + "kl": 0.031327863136539236, + "learning_rate": 7.9524e-06, + "loss": 0.0013, + "num_tokens": 5597931.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 213.5625, + "completions/mean_terminated_length": 213.5625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.012835472578763127, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14453125, + "kl": 0.05756760714575648, + "learning_rate": 7.952e-06, + "loss": 0.0023, + "num_tokens": 5639261.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 207.84375, + "completions/mean_terminated_length": 207.84375, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.01294155086453803, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16796875, + "kl": 0.05318293231539428, + "learning_rate": 7.9516e-06, + "loss": 0.0021, + "num_tokens": 5662744.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 190.1875, + "completions/mean_terminated_length": 190.1875, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.01304762915031293, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16015625, + "kl": 0.05158931959886104, + "learning_rate": 7.9512e-06, + "loss": 0.0021, + "num_tokens": 5704862.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 184.1875, + "completions/mean_terminated_length": 184.1875, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.013153707436087833, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13671875, + "kl": 0.048936213250271976, + "learning_rate": 7.9508e-06, + "loss": 0.002, + "num_tokens": 5736100.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 795.0, + "completions/max_terminated_length": 795.0, + "completions/mean_length": 410.5625, + "completions/mean_terminated_length": 410.5625, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.013259785721862734, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.03778961458010599, + "learning_rate": 7.9504e-06, + "loss": 0.0015, + "num_tokens": 5796438.0, + "reward": 3.253349542617798, + "reward_std": 0.24341487884521484, + "rewards/reward_fn/mean": 3.253349542617798, + "rewards/reward_fn/std": 0.24341486394405365, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 238.21875, + "completions/mean_terminated_length": 238.21875, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.013365864007637637, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.03385805507423356, + "learning_rate": 7.95e-06, + "loss": 0.0014, + "num_tokens": 5864573.0, + "reward": 2.7737503051757812, + "reward_std": 0.32643312215805054, + "rewards/reward_fn/mean": 2.7737503051757812, + "rewards/reward_fn/std": 0.32643312215805054, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 118.0, + "completions/max_terminated_length": 118.0, + "completions/mean_length": 96.03125, + "completions/mean_terminated_length": 96.03125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.013471942293412538, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1962890625, + "kl": 0.029550763370934874, + "learning_rate": 7.9496e-06, + "loss": 0.0012, + "num_tokens": 5897438.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 169.96875, + "completions/mean_terminated_length": 169.96875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.01357802057918744, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.03915034799138084, + "learning_rate": 7.9492e-06, + "loss": 0.0016, + "num_tokens": 5940637.0, + "reward": 3.2658727169036865, + "reward_std": 0.14581863582134247, + "rewards/reward_fn/mean": 3.2658727169036865, + "rewards/reward_fn/std": 0.14581862092018127, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 163.0, + "completions/max_terminated_length": 163.0, + "completions/mean_length": 145.78125, + "completions/mean_terminated_length": 145.78125, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.013684098864962342, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0908203125, + "kl": 0.0380926345824264, + "learning_rate": 7.9488e-06, + "loss": 0.0015, + "num_tokens": 5976566.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 144.0, + "completions/max_terminated_length": 144.0, + "completions/mean_length": 125.0, + "completions/mean_terminated_length": 125.0, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.013790177150737245, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11328125, + "kl": 0.0405671053158585, + "learning_rate": 7.9484e-06, + "loss": 0.0016, + "num_tokens": 6020598.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 80.375, + "completions/mean_terminated_length": 80.375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.013896255436512146, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1552734375, + "kl": 0.03241157752927393, + "learning_rate": 7.948e-06, + "loss": 0.0013, + "num_tokens": 6063330.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 648.0, + "completions/max_terminated_length": 648.0, + "completions/mean_length": 430.46875, + "completions/mean_terminated_length": 430.46875, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.014002333722287048, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.024634861125377938, + "learning_rate": 7.9476e-06, + "loss": 0.001, + "num_tokens": 6113649.0, + "reward": 3.187520980834961, + "reward_std": 0.4377913773059845, + "rewards/reward_fn/mean": 3.187520980834961, + "rewards/reward_fn/std": 0.4377914071083069, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 260.34375, + "completions/mean_terminated_length": 260.34375, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.01410841200806195, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.04265774926170707, + "learning_rate": 7.947199999999999e-06, + "loss": 0.0017, + "num_tokens": 6137468.0, + "reward": 3.061521530151367, + "reward_std": 0.5167216062545776, + "rewards/reward_fn/mean": 3.061521530151367, + "rewards/reward_fn/std": 0.5167215466499329, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 107.59375, + "completions/mean_terminated_length": 107.59375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.014214490293836852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14453125, + "kl": 0.022619035269599408, + "learning_rate": 7.9468e-06, + "loss": 0.0009, + "num_tokens": 6175791.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 195.75, + "completions/mean_terminated_length": 195.75, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.014320568579611753, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.026089665334438905, + "learning_rate": 7.946399999999999e-06, + "loss": 0.001, + "num_tokens": 6213511.0, + "reward": 2.8176023960113525, + "reward_std": 0.3619799017906189, + "rewards/reward_fn/mean": 2.8176023960113525, + "rewards/reward_fn/std": 0.3619799017906189, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 180.71875, + "completions/mean_terminated_length": 180.71875, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.014426646865386656, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.095703125, + "kl": 0.040278222353663296, + "learning_rate": 7.946e-06, + "loss": 0.0016, + "num_tokens": 6256158.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1010.0, + "completions/max_terminated_length": 1010.0, + "completions/mean_length": 459.96875, + "completions/mean_terminated_length": 459.96875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.014532725151161557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06298828125, + "kl": 0.024632773885969073, + "learning_rate": 7.945599999999999e-06, + "loss": 0.001, + "num_tokens": 6306045.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1977.0, + "completions/mean_length": 723.0, + "completions/mean_terminated_length": 680.258056640625, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "epoch": 0.01463880343693646, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.015625, + "kl": 0.027096317993709818, + "learning_rate": 7.9452e-06, + "loss": 0.0011, + "num_tokens": 6360445.0, + "reward": 2.2733407020568848, + "reward_std": 0.8242188096046448, + "rewards/reward_fn/mean": 2.2733407020568848, + "rewards/reward_fn/std": 0.8242188096046448, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1742.0, + "completions/max_terminated_length": 1742.0, + "completions/mean_length": 603.75, + "completions/mean_terminated_length": 603.75, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.01474488172271136, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078125, + "kl": 0.02166763847344555, + "learning_rate": 7.944799999999999e-06, + "loss": 0.0009, + "num_tokens": 6415573.0, + "reward": 2.891838312149048, + "reward_std": 0.06433243304491043, + "rewards/reward_fn/mean": 2.891838312149048, + "rewards/reward_fn/std": 0.06433244049549103, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 187.6875, + "completions/mean_terminated_length": 187.6875, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.014850960008486264, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.134765625, + "kl": 0.027505328936968, + "learning_rate": 7.9444e-06, + "loss": 0.0011, + "num_tokens": 6463531.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 128.375, + "completions/mean_terminated_length": 128.375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.014957038294261165, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.026608427229803056, + "learning_rate": 7.943999999999999e-06, + "loss": 0.0011, + "num_tokens": 6516215.0, + "reward": 3.2211999893188477, + "reward_std": 0.14292843639850616, + "rewards/reward_fn/mean": 3.2211999893188477, + "rewards/reward_fn/std": 0.14292845129966736, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 280.0, + "completions/mean_terminated_length": 280.0, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.015063116580036067, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.047853924334049225, + "learning_rate": 7.9436e-06, + "loss": 0.0019, + "num_tokens": 6559991.0, + "reward": 2.7001118659973145, + "reward_std": 0.026461286470294, + "rewards/reward_fn/mean": 2.7001118659973145, + "rewards/reward_fn/std": 0.026461288332939148, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 773.0, + "completions/max_terminated_length": 773.0, + "completions/mean_length": 365.40625, + "completions/mean_terminated_length": 365.40625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.015169194865810968, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.038805975636933, + "learning_rate": 7.9432e-06, + "loss": 0.0016, + "num_tokens": 6606052.0, + "reward": 2.986452579498291, + "reward_std": 0.7081362009048462, + "rewards/reward_fn/mean": 2.986452579498291, + "rewards/reward_fn/std": 0.7081362009048462, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 162.875, + "completions/mean_terminated_length": 162.875, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.01527527315158587, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10302734375, + "kl": 0.021103895822307095, + "learning_rate": 7.9428e-06, + "loss": 0.0008, + "num_tokens": 6640832.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 235.125, + "completions/mean_terminated_length": 235.125, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.015381351437360772, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.0223095424589701, + "learning_rate": 7.9424e-06, + "loss": 0.0009, + "num_tokens": 6686628.0, + "reward": 3.0702104568481445, + "reward_std": 0.591320812702179, + "rewards/reward_fn/mean": 3.0702104568481445, + "rewards/reward_fn/std": 0.5913207530975342, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 211.34375, + "completions/mean_terminated_length": 211.34375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.015487429723135673, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.027028481126762927, + "learning_rate": 7.942e-06, + "loss": 0.0011, + "num_tokens": 6731471.0, + "reward": 3.1773626804351807, + "reward_std": 0.22673729062080383, + "rewards/reward_fn/mean": 3.1773626804351807, + "rewards/reward_fn/std": 0.22673727571964264, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 220.125, + "completions/mean_terminated_length": 220.125, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.015593508008910576, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.039822932973038405, + "learning_rate": 7.9416e-06, + "loss": 0.0016, + "num_tokens": 6775379.0, + "reward": 2.9473490715026855, + "reward_std": 0.40440940856933594, + "rewards/reward_fn/mean": 2.9473490715026855, + "rewards/reward_fn/std": 0.40440940856933594, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 919.0, + "completions/max_terminated_length": 919.0, + "completions/mean_length": 575.1875, + "completions/mean_terminated_length": 575.1875, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.01569958629468548, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.023320580541621894, + "learning_rate": 7.9412e-06, + "loss": 0.0009, + "num_tokens": 6835545.0, + "reward": 2.5726113319396973, + "reward_std": 0.3018931448459625, + "rewards/reward_fn/mean": 2.5726113319396973, + "rewards/reward_fn/std": 0.3018931448459625, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 144.0, + "completions/max_terminated_length": 144.0, + "completions/mean_length": 104.375, + "completions/mean_terminated_length": 104.375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.015805664580460378, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21484375, + "kl": 0.032563303771894425, + "learning_rate": 7.9408e-06, + "loss": 0.0013, + "num_tokens": 6872037.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 155.8125, + "completions/mean_terminated_length": 155.8125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.01591174286623528, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1396484375, + "kl": 0.04502526589203626, + "learning_rate": 7.9404e-06, + "loss": 0.0018, + "num_tokens": 6905183.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 223.1875, + "completions/mean_terminated_length": 223.1875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.016017821152010184, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.039221919083502144, + "learning_rate": 7.94e-06, + "loss": 0.0016, + "num_tokens": 6945445.0, + "reward": 3.901937961578369, + "reward_std": 0.3098265826702118, + "rewards/reward_fn/mean": 3.901937961578369, + "rewards/reward_fn/std": 0.3098265528678894, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 338.75, + "completions/mean_terminated_length": 338.75, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.016123899437785087, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.03526619606418535, + "learning_rate": 7.9396e-06, + "loss": 0.0014, + "num_tokens": 6993149.0, + "reward": 3.3024063110351562, + "reward_std": 0.651430606842041, + "rewards/reward_fn/mean": 3.3024063110351562, + "rewards/reward_fn/std": 0.6514305472373962, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 187.25, + "completions/mean_terminated_length": 187.25, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.016229977723559986, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0908203125, + "kl": 0.04323679523076862, + "learning_rate": 7.939199999999998e-06, + "loss": 0.0017, + "num_tokens": 7032805.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 120.90625, + "completions/mean_terminated_length": 120.90625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.01633605600933489, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.78125, + "kl": 0.06397866155020893, + "learning_rate": 7.9388e-06, + "loss": 0.0025, + "num_tokens": 7065250.0, + "reward": 2.901592969894409, + "reward_std": 0.022638218477368355, + "rewards/reward_fn/mean": 2.901592969894409, + "rewards/reward_fn/std": 0.02263822965323925, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 161.875, + "completions/mean_terminated_length": 161.875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.01644213429510979, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.06367940676864237, + "learning_rate": 7.9384e-06, + "loss": 0.0025, + "num_tokens": 7111198.0, + "reward": 3.8250930309295654, + "reward_std": 0.3699547052383423, + "rewards/reward_fn/mean": 3.8250930309295654, + "rewards/reward_fn/std": 0.3699546456336975, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 264.8125, + "completions/mean_terminated_length": 264.8125, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.016548212580884694, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12451171875, + "kl": 0.048206568928435445, + "learning_rate": 7.938e-06, + "loss": 0.0019, + "num_tokens": 7173912.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 123.0, + "completions/max_terminated_length": 123.0, + "completions/mean_length": 114.21875, + "completions/mean_terminated_length": 114.21875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.016654290866659593, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1806640625, + "kl": 0.04626023437594995, + "learning_rate": 7.9376e-06, + "loss": 0.0019, + "num_tokens": 7205311.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 124.0, + "completions/max_terminated_length": 124.0, + "completions/mean_length": 117.5625, + "completions/mean_terminated_length": 117.5625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.016760369152434496, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10302734375, + "kl": 0.023312504577916116, + "learning_rate": 7.9372e-06, + "loss": 0.0009, + "num_tokens": 7239921.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 299.125, + "completions/mean_terminated_length": 299.125, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.0168664474382094, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.048653680307324976, + "learning_rate": 7.9368e-06, + "loss": 0.0019, + "num_tokens": 7282933.0, + "reward": 3.4297356605529785, + "reward_std": 0.4795074164867401, + "rewards/reward_fn/mean": 3.4297356605529785, + "rewards/reward_fn/std": 0.4795074760913849, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 310.4375, + "completions/mean_terminated_length": 310.4375, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.016972525723984302, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078125, + "kl": 0.03421498887473717, + "learning_rate": 7.936399999999999e-06, + "loss": 0.0014, + "num_tokens": 7322243.0, + "reward": 2.825167179107666, + "reward_std": 0.0166213046759367, + "rewards/reward_fn/mean": 2.825167179107666, + "rewards/reward_fn/std": 0.01662134751677513, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 223.84375, + "completions/mean_terminated_length": 223.84375, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.0170786040097592, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.048001356422901154, + "learning_rate": 7.936e-06, + "loss": 0.0019, + "num_tokens": 7358174.0, + "reward": 1.793137788772583, + "reward_std": 0.006353580858558416, + "rewards/reward_fn/mean": 1.793137788772583, + "rewards/reward_fn/std": 0.006353587377816439, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 185.03125, + "completions/mean_terminated_length": 185.03125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.017184682295534104, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.04288411437300965, + "learning_rate": 7.935599999999999e-06, + "loss": 0.0017, + "num_tokens": 7395775.0, + "reward": 2.704394817352295, + "reward_std": 0.028164710849523544, + "rewards/reward_fn/mean": 2.704394817352295, + "rewards/reward_fn/std": 0.028164727613329887, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 128.15625, + "completions/mean_terminated_length": 128.15625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.017290760581309007, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.228515625, + "kl": 0.029513808840420097, + "learning_rate": 7.9352e-06, + "loss": 0.0012, + "num_tokens": 7441220.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 274.4375, + "completions/mean_terminated_length": 274.4375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.01739683886708391, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07763671875, + "kl": 0.03132422378985211, + "learning_rate": 7.934799999999999e-06, + "loss": 0.0013, + "num_tokens": 7485106.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 328.625, + "completions/mean_terminated_length": 273.1612854003906, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.01750291715285881, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.0340282313991338, + "learning_rate": 7.9344e-06, + "loss": 0.0014, + "num_tokens": 7507526.0, + "reward": 3.6591928005218506, + "reward_std": 0.8599532246589661, + "rewards/reward_fn/mean": 3.6591928005218506, + "rewards/reward_fn/std": 0.8599532842636108, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 228.84375, + "completions/mean_terminated_length": 228.84375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.01760899543863371, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.061049588199239224, + "learning_rate": 7.934e-06, + "loss": 0.0024, + "num_tokens": 7561217.0, + "reward": 3.0768370628356934, + "reward_std": 0.06554757058620453, + "rewards/reward_fn/mean": 3.0768370628356934, + "rewards/reward_fn/std": 0.06554758548736572, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 93.0, + "completions/max_terminated_length": 93.0, + "completions/mean_length": 73.96875, + "completions/mean_terminated_length": 73.96875, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.017715073724408614, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.232421875, + "kl": 0.03734696819446981, + "learning_rate": 7.9336e-06, + "loss": 0.0015, + "num_tokens": 7605536.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 301.25, + "completions/mean_terminated_length": 301.25, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.017821152010183517, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "kl": 0.05416923708980903, + "learning_rate": 7.9332e-06, + "loss": 0.0022, + "num_tokens": 7651144.0, + "reward": 2.543686628341675, + "reward_std": 0.5045425891876221, + "rewards/reward_fn/mean": 2.543686628341675, + "rewards/reward_fn/std": 0.5045425891876221, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 180.0, + "completions/mean_terminated_length": 180.0, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.017927230295958416, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10498046875, + "kl": 0.03905608196510002, + "learning_rate": 7.9328e-06, + "loss": 0.0016, + "num_tokens": 7701384.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1153.0, + "completions/max_terminated_length": 1153.0, + "completions/mean_length": 426.53125, + "completions/mean_terminated_length": 426.53125, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.01803330858173332, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.04825877823168412, + "learning_rate": 7.9324e-06, + "loss": 0.0019, + "num_tokens": 7751033.0, + "reward": 2.7124316692352295, + "reward_std": 0.5163800120353699, + "rewards/reward_fn/mean": 2.7124316692352295, + "rewards/reward_fn/std": 0.5163800716400146, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 180.21875, + "completions/mean_terminated_length": 180.21875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.018139386867508222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1318359375, + "kl": 0.04462099994998425, + "learning_rate": 7.932e-06, + "loss": 0.0018, + "num_tokens": 7798016.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 116.46875, + "completions/mean_terminated_length": 116.46875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.01824546515328312, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.173828125, + "kl": 0.04711482278071344, + "learning_rate": 7.9316e-06, + "loss": 0.0019, + "num_tokens": 7833263.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 209.84375, + "completions/mean_terminated_length": 209.84375, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.018351543439058024, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.05251658370252699, + "learning_rate": 7.9312e-06, + "loss": 0.0021, + "num_tokens": 7879786.0, + "reward": 3.908167839050293, + "reward_std": 0.2901296019554138, + "rewards/reward_fn/mean": 3.908167839050293, + "rewards/reward_fn/std": 0.2901296317577362, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 197.90625, + "completions/mean_terminated_length": 197.90625, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.018457621724832927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0888671875, + "kl": 0.019432318513281643, + "learning_rate": 7.930799999999999e-06, + "loss": 0.0008, + "num_tokens": 7924391.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 134.0, + "completions/max_terminated_length": 134.0, + "completions/mean_length": 130.78125, + "completions/mean_terminated_length": 130.78125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.01856370001060783, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5, + "kl": 0.030577484285458922, + "learning_rate": 7.9304e-06, + "loss": 0.0012, + "num_tokens": 7962624.0, + "reward": 3.843151092529297, + "reward_std": 0.27601996064186096, + "rewards/reward_fn/mean": 3.843151092529297, + "rewards/reward_fn/std": 0.2760199308395386, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 165.0, + "completions/max_terminated_length": 165.0, + "completions/mean_length": 104.78125, + "completions/mean_terminated_length": 104.78125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.01866977829638273, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.240234375, + "kl": 0.052696100203320384, + "learning_rate": 7.929999999999999e-06, + "loss": 0.0021, + "num_tokens": 7989881.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 67.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 60.0, + "completions/mean_terminated_length": 60.0, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.01877585658215763, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2255859375, + "kl": 0.04239236278226599, + "learning_rate": 7.9296e-06, + "loss": 0.0017, + "num_tokens": 8023833.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 178.0, + "completions/max_terminated_length": 178.0, + "completions/mean_length": 165.8125, + "completions/mean_terminated_length": 165.8125, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.018881934867932534, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1083984375, + "kl": 0.029231760883703828, + "learning_rate": 7.9292e-06, + "loss": 0.0012, + "num_tokens": 8062675.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 216.5, + "completions/mean_terminated_length": 216.5, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.018988013153707437, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1142578125, + "kl": 0.039269523753318936, + "learning_rate": 7.9288e-06, + "loss": 0.0016, + "num_tokens": 8108643.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 143.0, + "completions/max_terminated_length": 143.0, + "completions/mean_length": 119.34375, + "completions/mean_terminated_length": 119.34375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.019094091439482336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1083984375, + "kl": 0.03951008943840861, + "learning_rate": 7.9284e-06, + "loss": 0.0016, + "num_tokens": 8143758.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 175.0, + "completions/max_terminated_length": 175.0, + "completions/mean_length": 166.84375, + "completions/mean_terminated_length": 166.84375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.01920016972525724, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.02374349971069023, + "learning_rate": 7.928e-06, + "loss": 0.001, + "num_tokens": 8189033.0, + "reward": 3.7921862602233887, + "reward_std": 0.26086750626564026, + "rewards/reward_fn/mean": 3.7921862602233887, + "rewards/reward_fn/std": 0.26086747646331787, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 136.59375, + "completions/mean_terminated_length": 136.59375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.019306248011032142, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1884765625, + "kl": 0.04318763309856877, + "learning_rate": 7.9276e-06, + "loss": 0.0017, + "num_tokens": 8215548.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 258.0625, + "completions/mean_terminated_length": 258.0625, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.019412326296807045, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.05549651384353638, + "learning_rate": 7.9272e-06, + "loss": 0.0022, + "num_tokens": 8261726.0, + "reward": 2.7983384132385254, + "reward_std": 0.01264275424182415, + "rewards/reward_fn/mean": 2.7983384132385254, + "rewards/reward_fn/std": 0.01264276821166277, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 787.0, + "completions/max_terminated_length": 787.0, + "completions/mean_length": 280.8125, + "completions/mean_terminated_length": 280.8125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.019518404582581944, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.40625, + "kl": 0.03899583051679656, + "learning_rate": 7.9268e-06, + "loss": 0.0016, + "num_tokens": 8311064.0, + "reward": 3.5739221572875977, + "reward_std": 0.4618890881538391, + "rewards/reward_fn/mean": 3.5739221572875977, + "rewards/reward_fn/std": 0.46188902854919434, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 246.78125, + "completions/mean_terminated_length": 246.78125, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.019624482868356847, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.04396548826480284, + "learning_rate": 7.9264e-06, + "loss": 0.0018, + "num_tokens": 8348273.0, + "reward": 2.785221576690674, + "reward_std": 0.012262105010449886, + "rewards/reward_fn/mean": 2.785221576690674, + "rewards/reward_fn/std": 0.012262105941772461, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1926.0, + "completions/mean_length": 591.90625, + "completions/mean_terminated_length": 494.8333740234375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.01973056115413175, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8125, + "kl": 0.028339633194264024, + "learning_rate": 7.926e-06, + "loss": 0.0011, + "num_tokens": 8414830.0, + "reward": 2.7476210594177246, + "reward_std": 0.7238368391990662, + "rewards/reward_fn/mean": 2.7476210594177246, + "rewards/reward_fn/std": 0.7238367795944214, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1084.0, + "completions/max_terminated_length": 1084.0, + "completions/mean_length": 458.65625, + "completions/mean_terminated_length": 458.65625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.019836639439906652, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08642578125, + "kl": 0.03971007274230942, + "learning_rate": 7.925599999999999e-06, + "loss": 0.0016, + "num_tokens": 8466883.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 847.0, + "completions/max_terminated_length": 847.0, + "completions/mean_length": 336.09375, + "completions/mean_terminated_length": 336.09375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.019942717725681552, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.04593227559234947, + "learning_rate": 7.9252e-06, + "loss": 0.0018, + "num_tokens": 8511878.0, + "reward": 3.2010879516601562, + "reward_std": 0.09355498105287552, + "rewards/reward_fn/mean": 3.2010879516601562, + "rewards/reward_fn/std": 0.09355500340461731, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1489.0, + "completions/max_terminated_length": 1489.0, + "completions/mean_length": 444.09375, + "completions/mean_terminated_length": 444.09375, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.020048796011456455, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0791015625, + "kl": 0.034340374055318534, + "learning_rate": 7.9248e-06, + "loss": 0.0014, + "num_tokens": 8548201.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1693.0, + "completions/max_terminated_length": 1693.0, + "completions/mean_length": 563.90625, + "completions/mean_terminated_length": 563.90625, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.020154874297231357, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.037472407915629447, + "learning_rate": 7.9244e-06, + "loss": 0.0015, + "num_tokens": 8601350.0, + "reward": 2.781393527984619, + "reward_std": 0.017921049147844315, + "rewards/reward_fn/mean": 2.781393527984619, + "rewards/reward_fn/std": 0.017921047285199165, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1978.0, + "completions/mean_length": 1138.15625, + "completions/mean_terminated_length": 1044.034423828125, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.02026095258300626, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.78515625, + "kl": 0.027679344464559108, + "learning_rate": 7.924e-06, + "loss": 0.0011, + "num_tokens": 8675243.0, + "reward": 3.1059072017669678, + "reward_std": 1.2772547006607056, + "rewards/reward_fn/mean": 3.1059072017669678, + "rewards/reward_fn/std": 1.2772547006607056, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 160.3125, + "completions/mean_terminated_length": 160.3125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.02036703086878116, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.119140625, + "kl": 0.02794185228412971, + "learning_rate": 7.9236e-06, + "loss": 0.0011, + "num_tokens": 8721653.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 125.0, + "completions/max_terminated_length": 125.0, + "completions/mean_length": 102.46875, + "completions/mean_terminated_length": 102.46875, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.020473109154556062, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.158203125, + "kl": 0.041104476957116276, + "learning_rate": 7.923199999999999e-06, + "loss": 0.0016, + "num_tokens": 8746500.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 125.0, + "completions/max_terminated_length": 125.0, + "completions/mean_length": 104.5, + "completions/mean_terminated_length": 104.5, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.020579187440330965, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1923828125, + "kl": 0.03695642208913341, + "learning_rate": 7.9228e-06, + "loss": 0.0015, + "num_tokens": 8770548.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 81.0, + "completions/max_terminated_length": 81.0, + "completions/mean_length": 74.46875, + "completions/mean_terminated_length": 74.46875, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.020685265726105868, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.34375, + "kl": 0.008636852275230922, + "learning_rate": 7.922399999999999e-06, + "loss": 0.0003, + "num_tokens": 8807427.0, + "reward": 3.072772264480591, + "reward_std": 0.03884429484605789, + "rewards/reward_fn/mean": 3.072772264480591, + "rewards/reward_fn/std": 0.03884435072541237, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 188.84375, + "completions/mean_terminated_length": 188.84375, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.020791344011880767, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.087890625, + "kl": 0.02465893269982189, + "learning_rate": 7.922e-06, + "loss": 0.001, + "num_tokens": 8833886.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 186.125, + "completions/mean_terminated_length": 186.125, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.02089742229765567, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.030734462081454694, + "learning_rate": 7.921599999999999e-06, + "loss": 0.0012, + "num_tokens": 8879938.0, + "reward": 2.784573554992676, + "reward_std": 0.02381717413663864, + "rewards/reward_fn/mean": 2.784573554992676, + "rewards/reward_fn/std": 0.02381720580160618, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 108.0, + "completions/max_terminated_length": 108.0, + "completions/mean_length": 86.5625, + "completions/mean_terminated_length": 86.5625, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.021003500583430573, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.28515625, + "kl": 0.07277392386458814, + "learning_rate": 7.9212e-06, + "loss": 0.0029, + "num_tokens": 8926068.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 194.125, + "completions/mean_terminated_length": 194.125, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.021109578869205475, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.03341373009607196, + "learning_rate": 7.920799999999999e-06, + "loss": 0.0013, + "num_tokens": 8962680.0, + "reward": 3.2303833961486816, + "reward_std": 0.48945605754852295, + "rewards/reward_fn/mean": 3.2303833961486816, + "rewards/reward_fn/std": 0.48945602774620056, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 164.84375, + "completions/mean_terminated_length": 164.84375, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.021215657154980375, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.04750790970865637, + "learning_rate": 7.9204e-06, + "loss": 0.0019, + "num_tokens": 9005907.0, + "reward": 3.969318151473999, + "reward_std": 0.17356248199939728, + "rewards/reward_fn/mean": 3.969318151473999, + "rewards/reward_fn/std": 0.17356249690055847, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 206.5625, + "completions/mean_terminated_length": 206.5625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.021321735440755277, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.296875, + "kl": 0.038540702196769416, + "learning_rate": 7.92e-06, + "loss": 0.0015, + "num_tokens": 9044805.0, + "reward": 3.435892343521118, + "reward_std": 0.6499969959259033, + "rewards/reward_fn/mean": 3.435892343521118, + "rewards/reward_fn/std": 0.6499969959259033, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.0, + "completions/max_terminated_length": 677.0, + "completions/mean_length": 309.75, + "completions/mean_terminated_length": 309.75, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.02142781372653018, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.02935608651023358, + "learning_rate": 7.9196e-06, + "loss": 0.0012, + "num_tokens": 9096125.0, + "reward": 3.915349006652832, + "reward_std": 0.26741769909858704, + "rewards/reward_fn/mean": 3.915349006652832, + "rewards/reward_fn/std": 0.26741769909858704, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 325.6875, + "completions/mean_terminated_length": 325.6875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.02153389201230508, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.03687985229771584, + "learning_rate": 7.9192e-06, + "loss": 0.0015, + "num_tokens": 9147411.0, + "reward": 2.7406442165374756, + "reward_std": 0.027461480349302292, + "rewards/reward_fn/mean": 2.7406442165374756, + "rewards/reward_fn/std": 0.027461478486657143, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 221.0625, + "completions/mean_terminated_length": 221.0625, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.021639970298079982, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9765625, + "kl": 0.04066753340885043, + "learning_rate": 7.9188e-06, + "loss": 0.0016, + "num_tokens": 9186165.0, + "reward": 3.394157886505127, + "reward_std": 0.269808292388916, + "rewards/reward_fn/mean": 3.394157886505127, + "rewards/reward_fn/std": 0.269808292388916, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 239.1875, + "completions/mean_terminated_length": 239.1875, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.021746048583854885, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06005859375, + "kl": 0.02556429200922139, + "learning_rate": 7.9184e-06, + "loss": 0.001, + "num_tokens": 9215099.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 340.0, + "completions/mean_terminated_length": 340.0, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.021852126869629788, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.038756676425691694, + "learning_rate": 7.918e-06, + "loss": 0.0016, + "num_tokens": 9273627.0, + "reward": 3.86188006401062, + "reward_std": 0.4663735628128052, + "rewards/reward_fn/mean": 3.86188006401062, + "rewards/reward_fn/std": 0.4663735628128052, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 214.03125, + "completions/mean_terminated_length": 214.03125, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.021958205155404687, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.087890625, + "kl": 0.04700932162813842, + "learning_rate": 7.9176e-06, + "loss": 0.0019, + "num_tokens": 9310940.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1511.0, + "completions/max_terminated_length": 1511.0, + "completions/mean_length": 681.5625, + "completions/mean_terminated_length": 681.5625, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "epoch": 0.02206428344117959, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.02947085179039277, + "learning_rate": 7.9172e-06, + "loss": 0.0012, + "num_tokens": 9364302.0, + "reward": 3.9197487831115723, + "reward_std": 0.3157848119735718, + "rewards/reward_fn/mean": 3.9197487831115723, + "rewards/reward_fn/std": 0.3157848119735718, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 197.625, + "completions/mean_terminated_length": 197.625, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.022170361726954493, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.03974838956492022, + "learning_rate": 7.9168e-06, + "loss": 0.0016, + "num_tokens": 9402146.0, + "reward": 3.14357852935791, + "reward_std": 0.41966521739959717, + "rewards/reward_fn/mean": 3.14357852935791, + "rewards/reward_fn/std": 0.41966521739959717, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 182.5625, + "completions/mean_terminated_length": 182.5625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.022276440012729395, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "kl": 0.049357162322849035, + "learning_rate": 7.9164e-06, + "loss": 0.002, + "num_tokens": 9444852.0, + "reward": 3.931201934814453, + "reward_std": 0.3891806900501251, + "rewards/reward_fn/mean": 3.931201934814453, + "rewards/reward_fn/std": 0.3891806900501251, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 99.0, + "completions/max_terminated_length": 99.0, + "completions/mean_length": 88.96875, + "completions/mean_terminated_length": 88.96875, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.022382518298504295, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.224609375, + "kl": 0.06599157059099525, + "learning_rate": 7.916e-06, + "loss": 0.0026, + "num_tokens": 9485139.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 190.0, + "completions/max_terminated_length": 190.0, + "completions/mean_length": 152.0625, + "completions/mean_terminated_length": 152.0625, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.022488596584279198, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13671875, + "kl": 0.0660236410330981, + "learning_rate": 7.9156e-06, + "loss": 0.0026, + "num_tokens": 9525557.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 208.0625, + "completions/mean_terminated_length": 208.0625, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.0225946748700541, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11962890625, + "kl": 0.059810824575833976, + "learning_rate": 7.9152e-06, + "loss": 0.0024, + "num_tokens": 9568375.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1020.0, + "completions/max_terminated_length": 1020.0, + "completions/mean_length": 476.15625, + "completions/mean_terminated_length": 476.15625, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.022700753155829003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7421875, + "kl": 0.028504444286227226, + "learning_rate": 7.9148e-06, + "loss": 0.0011, + "num_tokens": 9618044.0, + "reward": 2.8191311359405518, + "reward_std": 0.31098735332489014, + "rewards/reward_fn/mean": 2.8191311359405518, + "rewards/reward_fn/std": 0.31098735332489014, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 227.25, + "completions/mean_terminated_length": 227.25, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.022806831441603902, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.026997349661542103, + "learning_rate": 7.9144e-06, + "loss": 0.0011, + "num_tokens": 9663748.0, + "reward": 3.4124526977539062, + "reward_std": 0.1661146730184555, + "rewards/reward_fn/mean": 3.4124526977539062, + "rewards/reward_fn/std": 0.1661146730184555, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 234.375, + "completions/mean_terminated_length": 234.375, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.022912909727378805, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.04880005633458495, + "learning_rate": 7.913999999999999e-06, + "loss": 0.002, + "num_tokens": 9709584.0, + "reward": 3.9680237770080566, + "reward_std": 0.18088558316230774, + "rewards/reward_fn/mean": 3.9680237770080566, + "rewards/reward_fn/std": 0.18088559806346893, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 264.65625, + "completions/mean_terminated_length": 264.65625, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.023018988013153708, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.051924208470154554, + "learning_rate": 7.9136e-06, + "loss": 0.0021, + "num_tokens": 9753861.0, + "reward": 1.794126033782959, + "reward_std": 0.02186727151274681, + "rewards/reward_fn/mean": 1.794126033782959, + "rewards/reward_fn/std": 0.02186727523803711, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 223.8125, + "completions/mean_terminated_length": 223.8125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.02312506629892861, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.625, + "kl": 0.041035423579160124, + "learning_rate": 7.913199999999999e-06, + "loss": 0.0016, + "num_tokens": 9802303.0, + "reward": 2.8291447162628174, + "reward_std": 0.024067319929599762, + "rewards/reward_fn/mean": 2.8291447162628174, + "rewards/reward_fn/std": 0.024067312479019165, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 664.0, + "completions/max_terminated_length": 664.0, + "completions/mean_length": 489.6875, + "completions/mean_terminated_length": 489.6875, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.02323114458470351, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.03248375124530867, + "learning_rate": 7.9128e-06, + "loss": 0.0013, + "num_tokens": 9868373.0, + "reward": 3.9269018173217773, + "reward_std": 0.41350504755973816, + "rewards/reward_fn/mean": 3.9269018173217773, + "rewards/reward_fn/std": 0.4135049879550934, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 269.84375, + "completions/mean_terminated_length": 269.84375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.023337222870478413, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.03564566490240395, + "learning_rate": 7.912399999999999e-06, + "loss": 0.0014, + "num_tokens": 9908272.0, + "reward": 2.8743367195129395, + "reward_std": 0.010945392772555351, + "rewards/reward_fn/mean": 2.8743367195129395, + "rewards/reward_fn/std": 0.01094542071223259, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 849.0, + "completions/max_terminated_length": 849.0, + "completions/mean_length": 307.9375, + "completions/mean_terminated_length": 307.9375, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.023443301156253316, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.049371514352969825, + "learning_rate": 7.912e-06, + "loss": 0.002, + "num_tokens": 9955278.0, + "reward": 2.8059816360473633, + "reward_std": 0.21458326280117035, + "rewards/reward_fn/mean": 2.8059816360473633, + "rewards/reward_fn/std": 0.21458324790000916, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 198.65625, + "completions/mean_terminated_length": 198.65625, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.02354937944202822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0908203125, + "kl": 0.04636182641843334, + "learning_rate": 7.911599999999999e-06, + "loss": 0.0019, + "num_tokens": 9993603.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 215.09375, + "completions/mean_terminated_length": 215.09375, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.023655457727803118, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1064453125, + "kl": 0.037611712352372706, + "learning_rate": 7.9112e-06, + "loss": 0.0015, + "num_tokens": 10035526.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 404.1875, + "completions/mean_terminated_length": 404.1875, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.02376153601357802, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.031055110739544034, + "learning_rate": 7.910799999999999e-06, + "loss": 0.0012, + "num_tokens": 10083276.0, + "reward": 3.8559327125549316, + "reward_std": 0.3414000868797302, + "rewards/reward_fn/mean": 3.8559327125549316, + "rewards/reward_fn/std": 0.34140002727508545, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 178.90625, + "completions/mean_terminated_length": 178.90625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.023867614299352923, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1396484375, + "kl": 0.045679540548007935, + "learning_rate": 7.9104e-06, + "loss": 0.0018, + "num_tokens": 10124553.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 623.0, + "completions/max_terminated_length": 623.0, + "completions/mean_length": 390.875, + "completions/mean_terminated_length": 390.875, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.023973692585127826, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.04144083388382569, + "learning_rate": 7.91e-06, + "loss": 0.0017, + "num_tokens": 10200741.0, + "reward": 2.8919546604156494, + "reward_std": 0.07341831922531128, + "rewards/reward_fn/mean": 2.8919546604156494, + "rewards/reward_fn/std": 0.07341834157705307, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 281.0, + "completions/mean_terminated_length": 281.0, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.024079770870902725, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5, + "kl": 0.041321132448501885, + "learning_rate": 7.9096e-06, + "loss": 0.0016, + "num_tokens": 10241029.0, + "reward": 3.0374879837036133, + "reward_std": 0.026088356971740723, + "rewards/reward_fn/mean": 3.0374879837036133, + "rewards/reward_fn/std": 0.02608831785619259, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 259.5, + "completions/mean_terminated_length": 259.5, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.024185849156677628, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.05707435368094593, + "learning_rate": 7.9092e-06, + "loss": 0.0023, + "num_tokens": 10278837.0, + "reward": 2.99910569190979, + "reward_std": 0.06235523894429207, + "rewards/reward_fn/mean": 2.99910569190979, + "rewards/reward_fn/std": 0.062355220317840576, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 179.0, + "completions/max_terminated_length": 179.0, + "completions/mean_length": 147.90625, + "completions/mean_terminated_length": 147.90625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.02429192744245253, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1123046875, + "kl": 0.055055871489457786, + "learning_rate": 7.9088e-06, + "loss": 0.0022, + "num_tokens": 10311794.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 169.0, + "completions/max_terminated_length": 169.0, + "completions/mean_length": 105.6875, + "completions/mean_terminated_length": 105.6875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.02439800572822743, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5, + "kl": 0.06182589090894908, + "learning_rate": 7.9084e-06, + "loss": 0.0025, + "num_tokens": 10361128.0, + "reward": 3.989091396331787, + "reward_std": 0.061708446592092514, + "rewards/reward_fn/mean": 3.989091396331787, + "rewards/reward_fn/std": 0.061708465218544006, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 112.0, + "completions/max_terminated_length": 112.0, + "completions/mean_length": 89.34375, + "completions/mean_terminated_length": 89.34375, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.024504084014002333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.35546875, + "kl": 0.0609287271508947, + "learning_rate": 7.908e-06, + "loss": 0.0024, + "num_tokens": 10404307.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 643.0, + "completions/max_terminated_length": 643.0, + "completions/mean_length": 374.75, + "completions/mean_terminated_length": 374.75, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.024610162299777236, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.045864680025260895, + "learning_rate": 7.9076e-06, + "loss": 0.0018, + "num_tokens": 10432651.0, + "reward": 2.8449668884277344, + "reward_std": 0.02234821394085884, + "rewards/reward_fn/mean": 2.8449668884277344, + "rewards/reward_fn/std": 0.022348226979374886, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 175.0, + "completions/mean_terminated_length": 175.0, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.02471624058555214, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "kl": 0.0719969633501023, + "learning_rate": 7.9072e-06, + "loss": 0.0029, + "num_tokens": 10478219.0, + "reward": 3.701000690460205, + "reward_std": 0.48612549901008606, + "rewards/reward_fn/mean": 3.701000690460205, + "rewards/reward_fn/std": 0.4861254394054413, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 97.0, + "completions/max_terminated_length": 97.0, + "completions/mean_length": 95.28125, + "completions/mean_terminated_length": 95.28125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.024822318871327038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.142578125, + "kl": 0.0665056451689452, + "learning_rate": 7.906799999999999e-06, + "loss": 0.0027, + "num_tokens": 10514356.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 187.0625, + "completions/mean_terminated_length": 187.0625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.02492839715710194, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9375, + "kl": 0.05317691981326789, + "learning_rate": 7.9064e-06, + "loss": 0.0021, + "num_tokens": 10558326.0, + "reward": 3.9692771434783936, + "reward_std": 0.1737947314977646, + "rewards/reward_fn/mean": 3.9692771434783936, + "rewards/reward_fn/std": 0.17379476130008698, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 196.53125, + "completions/mean_terminated_length": 196.53125, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.025034475442876843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08544921875, + "kl": 0.046237326750997454, + "learning_rate": 7.905999999999999e-06, + "loss": 0.0018, + "num_tokens": 10611559.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 176.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 143.09375, + "completions/mean_terminated_length": 143.09375, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.025140553728651746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.193359375, + "kl": 0.07731026317924261, + "learning_rate": 7.9056e-06, + "loss": 0.0031, + "num_tokens": 10636874.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 194.0, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 153.9375, + "completions/mean_terminated_length": 153.9375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.025246632014426645, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1484375, + "kl": 0.0432970619876869, + "learning_rate": 7.9052e-06, + "loss": 0.0017, + "num_tokens": 10695240.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 652.0, + "completions/mean_length": 473.0625, + "completions/mean_terminated_length": 368.0666809082031, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.025352710300201548, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.04987858441018034, + "learning_rate": 7.9048e-06, + "loss": 0.002, + "num_tokens": 10725962.0, + "reward": 1.7599223852157593, + "reward_std": 0.5935760736465454, + "rewards/reward_fn/mean": 1.7599223852157593, + "rewards/reward_fn/std": 0.5935760140419006, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 774.0, + "completions/max_terminated_length": 774.0, + "completions/mean_length": 563.125, + "completions/mean_terminated_length": 563.125, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "epoch": 0.02545878858597645, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.038596726139076054, + "learning_rate": 7.9044e-06, + "loss": 0.0015, + "num_tokens": 10788334.0, + "reward": 2.9869730472564697, + "reward_std": 0.33395835757255554, + "rewards/reward_fn/mean": 2.9869730472564697, + "rewards/reward_fn/std": 0.33395832777023315, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 202.4375, + "completions/mean_terminated_length": 202.4375, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.025564866871751354, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.06359338853508234, + "learning_rate": 7.904e-06, + "loss": 0.0025, + "num_tokens": 10833500.0, + "reward": 2.872314214706421, + "reward_std": 0.05506020411849022, + "rewards/reward_fn/mean": 2.872314214706421, + "rewards/reward_fn/std": 0.055060189217329025, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 181.0625, + "completions/mean_terminated_length": 181.0625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.025670945157526253, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.05014959839172661, + "learning_rate": 7.9036e-06, + "loss": 0.002, + "num_tokens": 10874910.0, + "reward": 2.904085636138916, + "reward_std": 0.03150341659784317, + "rewards/reward_fn/mean": 2.904085636138916, + "rewards/reward_fn/std": 0.03150341659784317, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 186.46875, + "completions/mean_terminated_length": 186.46875, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.025777023443301156, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.04699235921725631, + "learning_rate": 7.903199999999999e-06, + "loss": 0.0019, + "num_tokens": 10896013.0, + "reward": 3.967304229736328, + "reward_std": 0.18495480716228485, + "rewards/reward_fn/mean": 3.967304229736328, + "rewards/reward_fn/std": 0.18495479226112366, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 172.1875, + "completions/mean_terminated_length": 172.1875, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.02588310172907606, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.06149169441778213, + "learning_rate": 7.9028e-06, + "loss": 0.0025, + "num_tokens": 10939251.0, + "reward": 3.968230724334717, + "reward_std": 0.17971432209014893, + "rewards/reward_fn/mean": 3.968230724334717, + "rewards/reward_fn/std": 0.17971433699131012, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 103.0, + "completions/max_terminated_length": 103.0, + "completions/mean_length": 95.28125, + "completions/mean_terminated_length": 95.28125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.02598918001485096, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.181640625, + "kl": 0.07846818270627409, + "learning_rate": 7.902399999999999e-06, + "loss": 0.0031, + "num_tokens": 10971612.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 272.5625, + "completions/mean_terminated_length": 272.5625, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.02609525830062586, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.06347244174685329, + "learning_rate": 7.902e-06, + "loss": 0.0025, + "num_tokens": 11019918.0, + "reward": 2.806910991668701, + "reward_std": 0.19936612248420715, + "rewards/reward_fn/mean": 2.806910991668701, + "rewards/reward_fn/std": 0.19936615228652954, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 230.21875, + "completions/mean_terminated_length": 230.21875, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.026201336586400763, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9296875, + "kl": 0.06723654980305582, + "learning_rate": 7.901599999999999e-06, + "loss": 0.0027, + "num_tokens": 11062581.0, + "reward": 3.2223823070526123, + "reward_std": 0.4942571222782135, + "rewards/reward_fn/mean": 3.2223823070526123, + "rewards/reward_fn/std": 0.4942571222782135, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 114.0, + "completions/max_terminated_length": 114.0, + "completions/mean_length": 92.15625, + "completions/mean_terminated_length": 92.15625, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.026307414872175666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2041015625, + "kl": 0.08666167384944856, + "learning_rate": 7.9012e-06, + "loss": 0.0035, + "num_tokens": 11082490.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 254.59375, + "completions/mean_terminated_length": 254.59375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.02641349315795057, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.047324330371338874, + "learning_rate": 7.9008e-06, + "loss": 0.0019, + "num_tokens": 11123149.0, + "reward": 2.9275622367858887, + "reward_std": 0.01993408240377903, + "rewards/reward_fn/mean": 2.9275622367858887, + "rewards/reward_fn/std": 0.019934087991714478, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 762.0, + "completions/max_terminated_length": 762.0, + "completions/mean_length": 388.625, + "completions/mean_terminated_length": 388.625, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.02651957144372547, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.03459140588529408, + "learning_rate": 7.9004e-06, + "loss": 0.0014, + "num_tokens": 11172769.0, + "reward": 2.702910900115967, + "reward_std": 0.029907824471592903, + "rewards/reward_fn/mean": 2.702910900115967, + "rewards/reward_fn/std": 0.0299078281968832, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 181.875, + "completions/mean_terminated_length": 181.875, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.02662564972950037, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.04598871496273205, + "learning_rate": 7.9e-06, + "loss": 0.0018, + "num_tokens": 11196861.0, + "reward": 3.9364213943481445, + "reward_std": 0.25018107891082764, + "rewards/reward_fn/mean": 3.9364213943481445, + "rewards/reward_fn/std": 0.25018110871315, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 186.8125, + "completions/mean_terminated_length": 186.8125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.026731728015275274, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.06444371730322018, + "learning_rate": 7.8996e-06, + "loss": 0.0026, + "num_tokens": 11243735.0, + "reward": 3.9610886573791504, + "reward_std": 0.220115527510643, + "rewards/reward_fn/mean": 3.9610886573791504, + "rewards/reward_fn/std": 0.2201155424118042, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 223.28125, + "completions/mean_terminated_length": 223.28125, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.026837806301050177, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09912109375, + "kl": 0.04907032381743193, + "learning_rate": 7.8992e-06, + "loss": 0.002, + "num_tokens": 11287904.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.0, + "completions/max_terminated_length": 591.0, + "completions/mean_length": 420.6875, + "completions/mean_terminated_length": 420.6875, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.026943884586825076, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046875, + "kl": 0.048126887530088425, + "learning_rate": 7.8988e-06, + "loss": 0.0019, + "num_tokens": 11333430.0, + "reward": 2.9162116050720215, + "reward_std": 0.41033974289894104, + "rewards/reward_fn/mean": 2.9162116050720215, + "rewards/reward_fn/std": 0.41033968329429626, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 149.65625, + "completions/mean_terminated_length": 149.65625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.02704996287259998, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.240234375, + "kl": 0.08926701568998396, + "learning_rate": 7.898399999999999e-06, + "loss": 0.0036, + "num_tokens": 11368939.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 227.21875, + "completions/mean_terminated_length": 227.21875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.02715604115837488, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.05576099781319499, + "learning_rate": 7.898e-06, + "loss": 0.0022, + "num_tokens": 11424722.0, + "reward": 3.6330835819244385, + "reward_std": 0.7868334054946899, + "rewards/reward_fn/mean": 3.6330835819244385, + "rewards/reward_fn/std": 0.7868334054946899, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 131.0, + "completions/max_terminated_length": 131.0, + "completions/mean_length": 113.71875, + "completions/mean_terminated_length": 113.71875, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.027262119444149784, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.0625, + "kl": 0.09824320953339338, + "learning_rate": 7.897599999999999e-06, + "loss": 0.0039, + "num_tokens": 11456265.0, + "reward": 3.084028720855713, + "reward_std": 0.049738768488168716, + "rewards/reward_fn/mean": 3.084028720855713, + "rewards/reward_fn/std": 0.04973877593874931, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 204.78125, + "completions/mean_terminated_length": 204.78125, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.027368197729924684, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.06405076989904046, + "learning_rate": 7.8972e-06, + "loss": 0.0026, + "num_tokens": 11495842.0, + "reward": 3.972503185272217, + "reward_std": 0.15554513037204742, + "rewards/reward_fn/mean": 3.972503185272217, + "rewards/reward_fn/std": 0.15554508566856384, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 165.0625, + "completions/mean_terminated_length": 165.0625, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.027474276015699586, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1328125, + "kl": 0.058812946430407465, + "learning_rate": 7.896799999999999e-06, + "loss": 0.0024, + "num_tokens": 11531428.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 192.34375, + "completions/mean_terminated_length": 192.34375, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.02758035430147449, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3671875, + "kl": 0.07722758501768112, + "learning_rate": 7.8964e-06, + "loss": 0.0031, + "num_tokens": 11570095.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 174.71875, + "completions/mean_terminated_length": 174.71875, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.02768643258724939, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1435546875, + "kl": 0.06501786492299289, + "learning_rate": 7.896e-06, + "loss": 0.0026, + "num_tokens": 11612422.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 163.75, + "completions/mean_terminated_length": 163.75, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.02779251087302429, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.28125, + "kl": 0.08121591794770211, + "learning_rate": 7.8956e-06, + "loss": 0.0033, + "num_tokens": 11647198.0, + "reward": 2.8717610836029053, + "reward_std": 0.009377574548125267, + "rewards/reward_fn/mean": 2.8717610836029053, + "rewards/reward_fn/std": 0.009377571754157543, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 192.53125, + "completions/mean_terminated_length": 192.53125, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.027898589158799194, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2314453125, + "kl": 0.05851406557485461, + "learning_rate": 7.8952e-06, + "loss": 0.0023, + "num_tokens": 11691119.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.0, + "completions/max_terminated_length": 562.0, + "completions/mean_length": 303.59375, + "completions/mean_terminated_length": 303.59375, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.028004667444574097, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.060931759886443615, + "learning_rate": 7.8948e-06, + "loss": 0.0024, + "num_tokens": 11730018.0, + "reward": 2.826643466949463, + "reward_std": 0.01925027370452881, + "rewards/reward_fn/mean": 2.826643466949463, + "rewards/reward_fn/std": 0.019250305369496346, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 154.875, + "completions/mean_terminated_length": 154.875, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.028110745730348996, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.05281880928669125, + "learning_rate": 7.8944e-06, + "loss": 0.0021, + "num_tokens": 11773342.0, + "reward": 2.93717885017395, + "reward_std": 0.022945858538150787, + "rewards/reward_fn/mean": 2.93717885017395, + "rewards/reward_fn/std": 0.02294587530195713, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 148.0, + "completions/max_terminated_length": 148.0, + "completions/mean_length": 98.5, + "completions/mean_terminated_length": 98.5, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.0282168240161239, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.251953125, + "kl": 0.12909941375255585, + "learning_rate": 7.894e-06, + "loss": 0.0052, + "num_tokens": 11814926.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 176.9375, + "completions/mean_terminated_length": 176.9375, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.0283229023018988, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.05416906625032425, + "learning_rate": 7.8936e-06, + "loss": 0.0022, + "num_tokens": 11853292.0, + "reward": 1.8326301574707031, + "reward_std": 0.24207736551761627, + "rewards/reward_fn/mean": 1.8326301574707031, + "rewards/reward_fn/std": 0.24207736551761627, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 143.75, + "completions/mean_terminated_length": 143.75, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.028428980587673704, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.671875, + "kl": 0.04807147482642904, + "learning_rate": 7.8932e-06, + "loss": 0.0019, + "num_tokens": 11891940.0, + "reward": 3.042628526687622, + "reward_std": 0.03994278982281685, + "rewards/reward_fn/mean": 3.042628526687622, + "rewards/reward_fn/std": 0.03994282707571983, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 171.25, + "completions/mean_terminated_length": 171.25, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.028535058873448604, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.07457437110133469, + "learning_rate": 7.8928e-06, + "loss": 0.003, + "num_tokens": 11967852.0, + "reward": 3.9629762172698975, + "reward_std": 0.2094382643699646, + "rewards/reward_fn/mean": 3.9629762172698975, + "rewards/reward_fn/std": 0.2094382643699646, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 167.0, + "completions/max_terminated_length": 167.0, + "completions/mean_length": 144.03125, + "completions/mean_terminated_length": 144.03125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.028641137159223506, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16015625, + "kl": 0.04853607731638476, + "learning_rate": 7.8924e-06, + "loss": 0.0019, + "num_tokens": 12022797.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 111.0, + "completions/max_terminated_length": 111.0, + "completions/mean_length": 87.28125, + "completions/mean_terminated_length": 87.28125, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.02874721544499841, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.09375, + "kl": 0.07282675942406058, + "learning_rate": 7.892e-06, + "loss": 0.0029, + "num_tokens": 12060790.0, + "reward": 3.9492380619049072, + "reward_std": 0.13654832541942596, + "rewards/reward_fn/mean": 3.9492380619049072, + "rewards/reward_fn/std": 0.1365482658147812, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 136.15625, + "completions/mean_terminated_length": 136.15625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.028853293730773312, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.453125, + "kl": 0.09152523137163371, + "learning_rate": 7.8916e-06, + "loss": 0.0037, + "num_tokens": 12106555.0, + "reward": 3.120361804962158, + "reward_std": 0.32416626811027527, + "rewards/reward_fn/mean": 3.120361804962158, + "rewards/reward_fn/std": 0.32416629791259766, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 178.0, + "completions/max_terminated_length": 178.0, + "completions/mean_length": 144.0625, + "completions/mean_terminated_length": 144.0625, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.02895937201654821, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.28125, + "kl": 0.05412678746506572, + "learning_rate": 7.8912e-06, + "loss": 0.0022, + "num_tokens": 12142813.0, + "reward": 2.741058111190796, + "reward_std": 0.016797101125121117, + "rewards/reward_fn/mean": 2.741058111190796, + "rewards/reward_fn/std": 0.016797110438346863, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 187.96875, + "completions/mean_terminated_length": 187.96875, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.029065450302323114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09130859375, + "kl": 0.04939495923463255, + "learning_rate": 7.890799999999999e-06, + "loss": 0.002, + "num_tokens": 12176988.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 692.0, + "completions/max_terminated_length": 692.0, + "completions/mean_length": 304.28125, + "completions/mean_terminated_length": 304.28125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.029171528588098017, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.06299524009227753, + "learning_rate": 7.8904e-06, + "loss": 0.0025, + "num_tokens": 12221829.0, + "reward": 2.059215545654297, + "reward_std": 0.4553739130496979, + "rewards/reward_fn/mean": 2.059215545654297, + "rewards/reward_fn/std": 0.4553739130496979, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 142.0, + "completions/max_terminated_length": 142.0, + "completions/mean_length": 76.28125, + "completions/mean_terminated_length": 76.28125, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.02927760687387292, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.197265625, + "kl": 0.031393807439599186, + "learning_rate": 7.889999999999999e-06, + "loss": 0.0013, + "num_tokens": 12261806.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1556.0, + "completions/max_terminated_length": 1556.0, + "completions/mean_length": 221.375, + "completions/mean_terminated_length": 221.375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.02938368515964782, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11767578125, + "kl": 0.034186359436716884, + "learning_rate": 7.8896e-06, + "loss": 0.0014, + "num_tokens": 12309018.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 127.0, + "completions/max_terminated_length": 127.0, + "completions/mean_length": 104.46875, + "completions/mean_terminated_length": 104.46875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.02948976344542272, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1640625, + "kl": 0.042672890005633235, + "learning_rate": 7.889199999999999e-06, + "loss": 0.0017, + "num_tokens": 12343017.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 147.0, + "completions/max_terminated_length": 147.0, + "completions/mean_length": 128.78125, + "completions/mean_terminated_length": 128.78125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.029595841731197624, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1298828125, + "kl": 0.06968200847040862, + "learning_rate": 7.8888e-06, + "loss": 0.0028, + "num_tokens": 12375650.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1057.0, + "completions/max_terminated_length": 1057.0, + "completions/mean_length": 320.46875, + "completions/mean_terminated_length": 320.46875, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.029701920016972527, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1875, + "kl": 0.06288569653406739, + "learning_rate": 7.888399999999999e-06, + "loss": 0.0025, + "num_tokens": 12437777.0, + "reward": 3.931251049041748, + "reward_std": 0.27083924412727356, + "rewards/reward_fn/mean": 3.931251049041748, + "rewards/reward_fn/std": 0.27083921432495117, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 157.0, + "completions/max_terminated_length": 157.0, + "completions/mean_length": 115.71875, + "completions/mean_terminated_length": 115.71875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.029807998302747427, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1455078125, + "kl": 0.06481995538342744, + "learning_rate": 7.888e-06, + "loss": 0.0026, + "num_tokens": 12473992.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 166.78125, + "completions/mean_terminated_length": 166.78125, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.02991407658852233, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.109375, + "kl": 0.05803050857502967, + "learning_rate": 7.887599999999999e-06, + "loss": 0.0023, + "num_tokens": 12496545.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1879.0, + "completions/max_terminated_length": 1879.0, + "completions/mean_length": 531.6875, + "completions/mean_terminated_length": 531.6875, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.030020154874297232, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1875, + "kl": 0.06650032801553607, + "learning_rate": 7.8872e-06, + "loss": 0.0027, + "num_tokens": 12548439.0, + "reward": 3.1316885948181152, + "reward_std": 0.381794273853302, + "rewards/reward_fn/mean": 3.1316885948181152, + "rewards/reward_fn/std": 0.3817942142486572, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 257.53125, + "completions/mean_terminated_length": 257.53125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.030126233160072135, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "kl": 0.05967911321204156, + "learning_rate": 7.8868e-06, + "loss": 0.0024, + "num_tokens": 12591048.0, + "reward": 2.9689855575561523, + "reward_std": 0.0832633227109909, + "rewards/reward_fn/mean": 2.9689855575561523, + "rewards/reward_fn/std": 0.0832633450627327, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 106.0, + "completions/max_terminated_length": 106.0, + "completions/mean_length": 83.1875, + "completions/mean_terminated_length": 83.1875, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.030232311445847034, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.828125, + "kl": 0.044598213862627745, + "learning_rate": 7.8864e-06, + "loss": 0.0018, + "num_tokens": 12621134.0, + "reward": 3.341036319732666, + "reward_std": 0.005204085260629654, + "rewards/reward_fn/mean": 3.341036319732666, + "rewards/reward_fn/std": 0.0052041225135326385, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 185.03125, + "completions/mean_terminated_length": 185.03125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.030338389731621937, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.21875, + "kl": 0.05357580480631441, + "learning_rate": 7.886e-06, + "loss": 0.0021, + "num_tokens": 12655343.0, + "reward": 3.5632970333099365, + "reward_std": 0.5730718970298767, + "rewards/reward_fn/mean": 3.5632970333099365, + "rewards/reward_fn/std": 0.5730718970298767, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 281.59375, + "completions/mean_terminated_length": 281.59375, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.03044446801739684, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.05916093214182183, + "learning_rate": 7.8856e-06, + "loss": 0.0024, + "num_tokens": 12701762.0, + "reward": 2.9577698707580566, + "reward_std": 0.07138428837060928, + "rewards/reward_fn/mean": 2.9577698707580566, + "rewards/reward_fn/std": 0.07138428092002869, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 684.0, + "completions/max_terminated_length": 684.0, + "completions/mean_length": 355.40625, + "completions/mean_terminated_length": 355.40625, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.03055054630317174, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.0407480897847563, + "learning_rate": 7.8852e-06, + "loss": 0.0016, + "num_tokens": 12757071.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1118.0, + "completions/max_terminated_length": 1118.0, + "completions/mean_length": 425.28125, + "completions/mean_terminated_length": 425.28125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.030656624588946642, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07275390625, + "kl": 0.03841312974691391, + "learning_rate": 7.8848e-06, + "loss": 0.0015, + "num_tokens": 12805368.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.0, + "completions/max_terminated_length": 679.0, + "completions/mean_length": 426.6875, + "completions/mean_terminated_length": 426.6875, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.030762702874721545, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0, + "kl": 0.06251498265191913, + "learning_rate": 7.8844e-06, + "loss": 0.0025, + "num_tokens": 12865678.0, + "reward": 2.680443525314331, + "reward_std": 0.1847320795059204, + "rewards/reward_fn/mean": 2.680443525314331, + "rewards/reward_fn/std": 0.18473204970359802, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 319.84375, + "completions/mean_terminated_length": 319.84375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.030868781160496447, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.039175089448690414, + "learning_rate": 7.884e-06, + "loss": 0.0016, + "num_tokens": 12890473.0, + "reward": 3.614109992980957, + "reward_std": 0.6846138834953308, + "rewards/reward_fn/mean": 3.614109992980957, + "rewards/reward_fn/std": 0.6846139430999756, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1154.0, + "completions/max_terminated_length": 1154.0, + "completions/mean_length": 346.28125, + "completions/mean_terminated_length": 346.28125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.030974859446271347, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.05797181086381897, + "learning_rate": 7.8836e-06, + "loss": 0.0023, + "num_tokens": 12932338.0, + "reward": 3.0171799659729004, + "reward_std": 0.024844994768500328, + "rewards/reward_fn/mean": 3.0171799659729004, + "rewards/reward_fn/std": 0.024844978004693985, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 307.1875, + "completions/mean_terminated_length": 307.1875, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.03108093773204625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.05411390570225194, + "learning_rate": 7.8832e-06, + "loss": 0.0022, + "num_tokens": 12981112.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 204.0, + "completions/max_terminated_length": 204.0, + "completions/mean_length": 172.84375, + "completions/mean_terminated_length": 172.84375, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.031187016017821152, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1005859375, + "kl": 0.04656997602432966, + "learning_rate": 7.882799999999998e-06, + "loss": 0.0019, + "num_tokens": 13030483.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 143.28125, + "completions/mean_terminated_length": 143.28125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.03129309430359605, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.09696014143992215, + "learning_rate": 7.8824e-06, + "loss": 0.0039, + "num_tokens": 13072348.0, + "reward": 3.930624008178711, + "reward_std": 0.27306249737739563, + "rewards/reward_fn/mean": 3.930624008178711, + "rewards/reward_fn/std": 0.273062527179718, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 95.0, + "completions/max_terminated_length": 95.0, + "completions/mean_length": 91.84375, + "completions/mean_terminated_length": 91.84375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.03139917258937096, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1904296875, + "kl": 0.05550631647929549, + "learning_rate": 7.882e-06, + "loss": 0.0022, + "num_tokens": 13092599.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 234.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 190.75, + "completions/mean_terminated_length": 190.75, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.03150525087514586, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.134765625, + "kl": 0.06322728469967842, + "learning_rate": 7.8816e-06, + "loss": 0.0025, + "num_tokens": 13138255.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 209.21875, + "completions/mean_terminated_length": 209.21875, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.031611329160920756, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10009765625, + "kl": 0.04859055450651795, + "learning_rate": 7.8812e-06, + "loss": 0.0019, + "num_tokens": 13170902.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1606.0, + "completions/mean_length": 941.46875, + "completions/mean_terminated_length": 905.774169921875, + "completions/min_length": 387.0, + "completions/min_terminated_length": 387.0, + "epoch": 0.03171740744669566, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.94921875, + "kl": 0.02767645107815042, + "learning_rate": 7.880799999999999e-06, + "loss": 0.0011, + "num_tokens": 13251205.0, + "reward": 2.9043970108032227, + "reward_std": 0.6032812595367432, + "rewards/reward_fn/mean": 2.9043970108032227, + "rewards/reward_fn/std": 0.6032813191413879, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 227.0625, + "completions/mean_terminated_length": 227.0625, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.03182348573247056, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9375, + "kl": 0.08030975563451648, + "learning_rate": 7.8804e-06, + "loss": 0.0032, + "num_tokens": 13288967.0, + "reward": 3.018399715423584, + "reward_std": 0.02766135148704052, + "rewards/reward_fn/mean": 3.018399715423584, + "rewards/reward_fn/std": 0.027661342173814774, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 966.0, + "completions/max_terminated_length": 966.0, + "completions/mean_length": 628.0, + "completions/mean_terminated_length": 628.0, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "epoch": 0.03192956401824547, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078125, + "kl": 0.04724831320345402, + "learning_rate": 7.879999999999999e-06, + "loss": 0.0019, + "num_tokens": 13358855.0, + "reward": 2.7298004627227783, + "reward_std": 0.19260305166244507, + "rewards/reward_fn/mean": 2.7298004627227783, + "rewards/reward_fn/std": 0.19260311126708984, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 224.25, + "completions/mean_terminated_length": 224.25, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.03203564230402037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11572265625, + "kl": 0.07541715656407177, + "learning_rate": 7.8796e-06, + "loss": 0.003, + "num_tokens": 13418479.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 202.59375, + "completions/mean_terminated_length": 202.59375, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.03214172058979527, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.06993928935844451, + "learning_rate": 7.879199999999999e-06, + "loss": 0.0028, + "num_tokens": 13465890.0, + "reward": 3.0506973266601562, + "reward_std": 0.8426318764686584, + "rewards/reward_fn/mean": 3.0506973266601562, + "rewards/reward_fn/std": 0.8426318764686584, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 225.71875, + "completions/mean_terminated_length": 225.71875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.03224779887557017, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08251953125, + "kl": 0.06964913755655289, + "learning_rate": 7.8788e-06, + "loss": 0.0028, + "num_tokens": 13512537.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 281.84375, + "completions/mean_terminated_length": 281.84375, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.03235387716134507, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.10228216869290918, + "learning_rate": 7.878399999999999e-06, + "loss": 0.0041, + "num_tokens": 13559316.0, + "reward": 3.088973045349121, + "reward_std": 0.3510621190071106, + "rewards/reward_fn/mean": 3.088973045349121, + "rewards/reward_fn/std": 0.3510621190071106, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 130.0, + "completions/max_terminated_length": 130.0, + "completions/mean_length": 93.90625, + "completions/mean_terminated_length": 93.90625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.03245995544711997, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1689453125, + "kl": 0.03925357403932139, + "learning_rate": 7.878e-06, + "loss": 0.0016, + "num_tokens": 13574705.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 164.40625, + "completions/mean_terminated_length": 164.40625, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.03256603373289488, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1513671875, + "kl": 0.09308067942038178, + "learning_rate": 7.8776e-06, + "loss": 0.0037, + "num_tokens": 13617726.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 242.59375, + "completions/mean_terminated_length": 242.59375, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.03267211201866978, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.04490488808369264, + "learning_rate": 7.8772e-06, + "loss": 0.0018, + "num_tokens": 13661489.0, + "reward": 2.939380645751953, + "reward_std": 0.02233259007334709, + "rewards/reward_fn/mean": 2.939380645751953, + "rewards/reward_fn/std": 0.02233261801302433, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 168.0625, + "completions/mean_terminated_length": 168.0625, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.03277819030444468, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.146484375, + "kl": 0.07158257730770856, + "learning_rate": 7.8768e-06, + "loss": 0.0029, + "num_tokens": 13700787.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 201.03125, + "completions/mean_terminated_length": 201.03125, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.03288426859021958, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1279296875, + "kl": 0.06688284839037806, + "learning_rate": 7.8764e-06, + "loss": 0.0027, + "num_tokens": 13742836.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 259.875, + "completions/mean_terminated_length": 259.875, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.03299034687599448, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.04511914274189621, + "learning_rate": 7.876e-06, + "loss": 0.0018, + "num_tokens": 13783056.0, + "reward": 3.962053060531616, + "reward_std": 0.21466024219989777, + "rewards/reward_fn/mean": 3.962053060531616, + "rewards/reward_fn/std": 0.21466021239757538, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 101.0, + "completions/max_terminated_length": 101.0, + "completions/mean_length": 97.15625, + "completions/mean_terminated_length": 97.15625, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.03309642516176939, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15625, + "kl": 0.04876602778676897, + "learning_rate": 7.8756e-06, + "loss": 0.002, + "num_tokens": 13826933.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 251.625, + "completions/mean_terminated_length": 251.625, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.03320250344754429, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.08308691228739917, + "learning_rate": 7.8752e-06, + "loss": 0.0033, + "num_tokens": 13895529.0, + "reward": 3.067251682281494, + "reward_std": 0.03814023733139038, + "rewards/reward_fn/mean": 3.067251682281494, + "rewards/reward_fn/std": 0.03814024478197098, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 839.0, + "completions/max_terminated_length": 839.0, + "completions/mean_length": 228.34375, + "completions/mean_terminated_length": 228.34375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.03330858173331919, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.10598539758939296, + "learning_rate": 7.8748e-06, + "loss": 0.0042, + "num_tokens": 13941396.0, + "reward": 3.958070755004883, + "reward_std": 0.2371879369020462, + "rewards/reward_fn/mean": 3.958070755004883, + "rewards/reward_fn/std": 0.237187922000885, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 718.0, + "completions/max_terminated_length": 718.0, + "completions/mean_length": 335.8125, + "completions/mean_terminated_length": 335.8125, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.03341466001909409, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.04709806991741061, + "learning_rate": 7.874399999999999e-06, + "loss": 0.0019, + "num_tokens": 13964750.0, + "reward": 3.926464080810547, + "reward_std": 0.41598135232925415, + "rewards/reward_fn/mean": 3.926464080810547, + "rewards/reward_fn/std": 0.41598138213157654, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1325.0, + "completions/mean_length": 684.34375, + "completions/mean_terminated_length": 489.5357360839844, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.03352073830486899, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.04148575963336043, + "learning_rate": 7.874e-06, + "loss": 0.0017, + "num_tokens": 14021081.0, + "reward": 2.240612030029297, + "reward_std": 1.1568071842193604, + "rewards/reward_fn/mean": 2.240612030029297, + "rewards/reward_fn/std": 1.1568071842193604, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 333.78125, + "completions/mean_terminated_length": 333.78125, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.03362681659064389, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.05524576886091381, + "learning_rate": 7.873599999999999e-06, + "loss": 0.0022, + "num_tokens": 14066962.0, + "reward": 3.1448464393615723, + "reward_std": 0.28363174200057983, + "rewards/reward_fn/mean": 3.1448464393615723, + "rewards/reward_fn/std": 0.2836317718029022, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1069.0, + "completions/max_terminated_length": 1069.0, + "completions/mean_length": 353.1875, + "completions/mean_terminated_length": 353.1875, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.0337328948764188, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.06478465755935758, + "learning_rate": 7.8732e-06, + "loss": 0.0026, + "num_tokens": 14109848.0, + "reward": 2.9029226303100586, + "reward_std": 0.061471011489629745, + "rewards/reward_fn/mean": 2.9029226303100586, + "rewards/reward_fn/std": 0.06147094815969467, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 268.125, + "completions/mean_terminated_length": 268.125, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.0338389731621937, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.07620589598082006, + "learning_rate": 7.8728e-06, + "loss": 0.003, + "num_tokens": 14151164.0, + "reward": 2.853126049041748, + "reward_std": 0.06743326038122177, + "rewards/reward_fn/mean": 2.853126049041748, + "rewards/reward_fn/std": 0.06743327528238297, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 236.96875, + "completions/mean_terminated_length": 236.96875, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.033945051447968604, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.08050445187836885, + "learning_rate": 7.8724e-06, + "loss": 0.0032, + "num_tokens": 14195995.0, + "reward": 2.7778611183166504, + "reward_std": 0.030127666890621185, + "rewards/reward_fn/mean": 2.7778611183166504, + "rewards/reward_fn/std": 0.030127670615911484, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 194.78125, + "completions/mean_terminated_length": 194.78125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.0340511297337435, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.08380766562186182, + "learning_rate": 7.872e-06, + "loss": 0.0034, + "num_tokens": 14251092.0, + "reward": 3.6946752071380615, + "reward_std": 0.21935689449310303, + "rewards/reward_fn/mean": 3.6946752071380615, + "rewards/reward_fn/std": 0.21935686469078064, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 278.5, + "completions/mean_terminated_length": 278.5, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.0341572080195184, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10205078125, + "kl": 0.09025908191688359, + "learning_rate": 7.8716e-06, + "loss": 0.0036, + "num_tokens": 14304740.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 117.21875, + "completions/mean_terminated_length": 117.21875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.03426328630529331, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1689453125, + "kl": 0.07025603024521843, + "learning_rate": 7.8712e-06, + "loss": 0.0028, + "num_tokens": 14329771.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 252.6875, + "completions/mean_terminated_length": 252.6875, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.03436936459106821, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.079339619143866, + "learning_rate": 7.8708e-06, + "loss": 0.0032, + "num_tokens": 14379169.0, + "reward": 3.0271012783050537, + "reward_std": 0.025822646915912628, + "rewards/reward_fn/mean": 3.0271012783050537, + "rewards/reward_fn/std": 0.025822622701525688, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 248.125, + "completions/mean_terminated_length": 248.125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.03447544287684311, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12060546875, + "kl": 0.11712923878803849, + "learning_rate": 7.8704e-06, + "loss": 0.0047, + "num_tokens": 14423781.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 229.0625, + "completions/mean_terminated_length": 229.0625, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.03458152116261801, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0947265625, + "kl": 0.06614450388588011, + "learning_rate": 7.87e-06, + "loss": 0.0026, + "num_tokens": 14464295.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 164.0, + "completions/mean_terminated_length": 164.0, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.03468759944839291, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.06275707174791023, + "learning_rate": 7.8696e-06, + "loss": 0.0025, + "num_tokens": 14514919.0, + "reward": 3.0044941902160645, + "reward_std": 0.014064479619264603, + "rewards/reward_fn/mean": 3.0044941902160645, + "rewards/reward_fn/std": 0.014064503833651543, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 116.0, + "completions/max_terminated_length": 116.0, + "completions/mean_length": 87.1875, + "completions/mean_terminated_length": 87.1875, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.03479367773416782, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.171875, + "kl": 0.030889727699104697, + "learning_rate": 7.869199999999999e-06, + "loss": 0.0012, + "num_tokens": 14552333.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 734.0, + "completions/max_terminated_length": 734.0, + "completions/mean_length": 352.6875, + "completions/mean_terminated_length": 352.6875, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.03489975601994272, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.06849374365992844, + "learning_rate": 7.8688e-06, + "loss": 0.0027, + "num_tokens": 14593507.0, + "reward": 2.9629576206207275, + "reward_std": 0.034861400723457336, + "rewards/reward_fn/mean": 2.9629576206207275, + "rewards/reward_fn/std": 0.03486141189932823, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 195.09375, + "completions/mean_terminated_length": 195.09375, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.03500583430571762, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.296875, + "kl": 0.09164670249447227, + "learning_rate": 7.8684e-06, + "loss": 0.0037, + "num_tokens": 14638374.0, + "reward": 3.929513931274414, + "reward_std": 0.3987296223640442, + "rewards/reward_fn/mean": 3.929513931274414, + "rewards/reward_fn/std": 0.3987296521663666, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 180.0, + "completions/max_terminated_length": 180.0, + "completions/mean_length": 150.8125, + "completions/mean_terminated_length": 150.8125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.035111912591492524, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09521484375, + "kl": 0.08404018310829997, + "learning_rate": 7.868e-06, + "loss": 0.0034, + "num_tokens": 14675680.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 194.46875, + "completions/mean_terminated_length": 194.46875, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.03521799087726742, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.09944120491854846, + "learning_rate": 7.8676e-06, + "loss": 0.004, + "num_tokens": 14715183.0, + "reward": 2.8881282806396484, + "reward_std": 0.014438438229262829, + "rewards/reward_fn/mean": 2.8881282806396484, + "rewards/reward_fn/std": 0.014438426122069359, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 712.0, + "completions/max_terminated_length": 712.0, + "completions/mean_length": 578.6875, + "completions/mean_terminated_length": 578.6875, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "epoch": 0.03532406916304232, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.04957582626957446, + "learning_rate": 7.8672e-06, + "loss": 0.002, + "num_tokens": 14777285.0, + "reward": 3.68550968170166, + "reward_std": 0.7617502808570862, + "rewards/reward_fn/mean": 3.68550968170166, + "rewards/reward_fn/std": 0.7617502212524414, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 276.0625, + "completions/mean_terminated_length": 276.0625, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.03543014744881723, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.051250798685941845, + "learning_rate": 7.866799999999999e-06, + "loss": 0.0021, + "num_tokens": 14824295.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 196.0, + "completions/mean_terminated_length": 196.0, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.03553622573459213, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.07835527614224702, + "learning_rate": 7.8664e-06, + "loss": 0.0031, + "num_tokens": 14873447.0, + "reward": 3.213829517364502, + "reward_std": 0.2138700932264328, + "rewards/reward_fn/mean": 3.213829517364502, + "rewards/reward_fn/std": 0.2138700783252716, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 875.0, + "completions/max_terminated_length": 875.0, + "completions/mean_length": 470.9375, + "completions/mean_terminated_length": 470.9375, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.035642304020367034, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.07522962475195527, + "learning_rate": 7.865999999999999e-06, + "loss": 0.003, + "num_tokens": 14934085.0, + "reward": 2.775710105895996, + "reward_std": 0.02678767405450344, + "rewards/reward_fn/mean": 2.775710105895996, + "rewards/reward_fn/std": 0.02678770385682583, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 217.28125, + "completions/mean_terminated_length": 217.28125, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.03574838230614193, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.08199672133196145, + "learning_rate": 7.8656e-06, + "loss": 0.0033, + "num_tokens": 14970926.0, + "reward": 2.9818575382232666, + "reward_std": 0.03788159415125847, + "rewards/reward_fn/mean": 2.9818575382232666, + "rewards/reward_fn/std": 0.037881579250097275, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 127.6875, + "completions/mean_terminated_length": 127.6875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.03585446059191683, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1796875, + "kl": 0.10428278136532754, + "learning_rate": 7.865199999999999e-06, + "loss": 0.0042, + "num_tokens": 15020196.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 178.0, + "completions/max_terminated_length": 178.0, + "completions/mean_length": 163.125, + "completions/mean_terminated_length": 163.125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.03596053887769174, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1064453125, + "kl": 0.1081386434379965, + "learning_rate": 7.8648e-06, + "loss": 0.0043, + "num_tokens": 15065544.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 288.25, + "completions/mean_terminated_length": 288.25, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.03606661716346664, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.015625, + "kl": 0.059167297556996346, + "learning_rate": 7.864399999999999e-06, + "loss": 0.0024, + "num_tokens": 15111632.0, + "reward": 2.826021194458008, + "reward_std": 0.021319499239325523, + "rewards/reward_fn/mean": 2.826021194458008, + "rewards/reward_fn/std": 0.021319523453712463, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 210.375, + "completions/mean_terminated_length": 210.375, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.03617269544924154, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18359375, + "kl": 0.09864125947933644, + "learning_rate": 7.864e-06, + "loss": 0.0039, + "num_tokens": 15136988.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 190.5, + "completions/mean_terminated_length": 190.5, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.036278773735016444, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.07318210729863495, + "learning_rate": 7.8636e-06, + "loss": 0.0029, + "num_tokens": 15181260.0, + "reward": 3.665925979614258, + "reward_std": 0.5428962707519531, + "rewards/reward_fn/mean": 3.665925979614258, + "rewards/reward_fn/std": 0.5428962707519531, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1174.0, + "completions/mean_length": 613.65625, + "completions/mean_terminated_length": 567.3870849609375, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "epoch": 0.03638485202079134, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6328125, + "kl": 0.04777409916277975, + "learning_rate": 7.8632e-06, + "loss": 0.0019, + "num_tokens": 15241761.0, + "reward": 2.752007484436035, + "reward_std": 0.5035932064056396, + "rewards/reward_fn/mean": 2.752007484436035, + "rewards/reward_fn/std": 0.5035931468009949, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 160.34375, + "completions/mean_terminated_length": 160.34375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.03649093030656624, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.765625, + "kl": 0.07249265850987285, + "learning_rate": 7.8628e-06, + "loss": 0.0029, + "num_tokens": 15281484.0, + "reward": 3.5909247398376465, + "reward_std": 0.5742731690406799, + "rewards/reward_fn/mean": 3.5909247398376465, + "rewards/reward_fn/std": 0.5742731690406799, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 260.0, + "completions/mean_terminated_length": 260.0, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.03659700859234115, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.05745165317784995, + "learning_rate": 7.8624e-06, + "loss": 0.0023, + "num_tokens": 15301420.0, + "reward": 3.959191083908081, + "reward_std": 0.2308497428894043, + "rewards/reward_fn/mean": 3.959191083908081, + "rewards/reward_fn/std": 0.23084969818592072, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 732.0, + "completions/mean_length": 462.4375, + "completions/mean_terminated_length": 411.2903137207031, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.03670308687811605, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.85546875, + "kl": 0.05426511203404516, + "learning_rate": 7.862e-06, + "loss": 0.0022, + "num_tokens": 15368378.0, + "reward": 2.8959522247314453, + "reward_std": 0.5591188669204712, + "rewards/reward_fn/mean": 2.8959522247314453, + "rewards/reward_fn/std": 0.5591188669204712, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 143.9375, + "completions/mean_terminated_length": 143.9375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.036809165163890954, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14453125, + "kl": 0.07412998762447387, + "learning_rate": 7.8616e-06, + "loss": 0.003, + "num_tokens": 15406200.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 171.6875, + "completions/mean_terminated_length": 171.6875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.036915243449665854, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09912109375, + "kl": 0.06604230729863048, + "learning_rate": 7.8612e-06, + "loss": 0.0026, + "num_tokens": 15444398.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 192.375, + "completions/mean_terminated_length": 192.375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.03702132173544075, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.08114727039355785, + "learning_rate": 7.8608e-06, + "loss": 0.0032, + "num_tokens": 15485882.0, + "reward": 3.92812180519104, + "reward_std": 0.2829616963863373, + "rewards/reward_fn/mean": 3.92812180519104, + "rewards/reward_fn/std": 0.28296172618865967, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 854.0, + "completions/max_terminated_length": 854.0, + "completions/mean_length": 428.0, + "completions/mean_terminated_length": 428.0, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.03712740002121566, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.05857816792558879, + "learning_rate": 7.8604e-06, + "loss": 0.0023, + "num_tokens": 15530746.0, + "reward": 3.9291768074035645, + "reward_std": 0.400637149810791, + "rewards/reward_fn/mean": 3.9291768074035645, + "rewards/reward_fn/std": 0.400637149810791, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 223.25, + "completions/mean_terminated_length": 223.25, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.03723347830699056, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.059653010219335556, + "learning_rate": 7.86e-06, + "loss": 0.0024, + "num_tokens": 15584002.0, + "reward": 3.0378000736236572, + "reward_std": 0.08900023251771927, + "rewards/reward_fn/mean": 3.0378000736236572, + "rewards/reward_fn/std": 0.08900019526481628, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 235.3125, + "completions/mean_terminated_length": 235.3125, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.03733955659276546, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.05214848497416824, + "learning_rate": 7.8596e-06, + "loss": 0.0021, + "num_tokens": 15645964.0, + "reward": 2.89939546585083, + "reward_std": 0.025253912433981895, + "rewards/reward_fn/mean": 2.89939546585083, + "rewards/reward_fn/std": 0.025253916159272194, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.0, + "completions/max_terminated_length": 562.0, + "completions/mean_length": 337.71875, + "completions/mean_terminated_length": 337.71875, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.037445634878540364, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.07626998424530029, + "learning_rate": 7.8592e-06, + "loss": 0.0031, + "num_tokens": 15678659.0, + "reward": 3.342073440551758, + "reward_std": 0.8421680927276611, + "rewards/reward_fn/mean": 3.342073440551758, + "rewards/reward_fn/std": 0.8421680927276611, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.0, + "completions/max_terminated_length": 628.0, + "completions/mean_length": 427.28125, + "completions/mean_terminated_length": 427.28125, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.03755171316431526, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.04384071903768927, + "learning_rate": 7.8588e-06, + "loss": 0.0018, + "num_tokens": 15734284.0, + "reward": 3.242161750793457, + "reward_std": 0.7822414636611938, + "rewards/reward_fn/mean": 3.242161750793457, + "rewards/reward_fn/std": 0.7822414040565491, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 125.0, + "completions/max_terminated_length": 125.0, + "completions/mean_length": 111.21875, + "completions/mean_terminated_length": 111.21875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.03765779145009017, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.099609375, + "kl": 0.02709541004151106, + "learning_rate": 7.8584e-06, + "loss": 0.0011, + "num_tokens": 15773587.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 253.03125, + "completions/mean_terminated_length": 253.03125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.03776386973586507, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.921875, + "kl": 0.05555059225298464, + "learning_rate": 7.858e-06, + "loss": 0.0022, + "num_tokens": 15821876.0, + "reward": 3.5505006313323975, + "reward_std": 0.5528962016105652, + "rewards/reward_fn/mean": 3.5505006313323975, + "rewards/reward_fn/std": 0.5528962016105652, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 190.5, + "completions/mean_terminated_length": 190.5, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.03786994802163997, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.119140625, + "kl": 0.05794315051753074, + "learning_rate": 7.857599999999999e-06, + "loss": 0.0023, + "num_tokens": 15858660.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 179.6875, + "completions/mean_terminated_length": 179.6875, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.037976026307414874, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1240234375, + "kl": 0.06639315851498395, + "learning_rate": 7.8572e-06, + "loss": 0.0027, + "num_tokens": 15908570.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1122.0, + "completions/max_terminated_length": 1122.0, + "completions/mean_length": 550.25, + "completions/mean_terminated_length": 550.25, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.038082104593189774, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.05704544053878635, + "learning_rate": 7.856799999999999e-06, + "loss": 0.0023, + "num_tokens": 15951650.0, + "reward": 2.769361972808838, + "reward_std": 0.1761067509651184, + "rewards/reward_fn/mean": 2.769361972808838, + "rewards/reward_fn/std": 0.1761067658662796, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 122.0, + "completions/max_terminated_length": 122.0, + "completions/mean_length": 109.46875, + "completions/mean_terminated_length": 109.46875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.03818818287896467, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09814453125, + "kl": 0.017623669904423878, + "learning_rate": 7.8564e-06, + "loss": 0.0007, + "num_tokens": 15981841.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 238.1875, + "completions/mean_terminated_length": 238.1875, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.03829426116473958, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.07996979088056833, + "learning_rate": 7.855999999999999e-06, + "loss": 0.0032, + "num_tokens": 16025623.0, + "reward": 2.948993444442749, + "reward_std": 0.07517999410629272, + "rewards/reward_fn/mean": 2.948993444442749, + "rewards/reward_fn/std": 0.07518000900745392, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 125.0, + "completions/max_terminated_length": 125.0, + "completions/mean_length": 92.25, + "completions/mean_terminated_length": 92.25, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.03840033945051448, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13671875, + "kl": 0.07063624216243625, + "learning_rate": 7.8556e-06, + "loss": 0.0028, + "num_tokens": 16066367.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 126.0, + "completions/max_terminated_length": 126.0, + "completions/mean_length": 106.28125, + "completions/mean_terminated_length": 106.28125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.038506417736289385, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11376953125, + "kl": 0.023690353438723832, + "learning_rate": 7.855199999999999e-06, + "loss": 0.0009, + "num_tokens": 16105928.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 235.1875, + "completions/mean_terminated_length": 235.1875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.038612496022064284, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.06796126707922667, + "learning_rate": 7.8548e-06, + "loss": 0.0027, + "num_tokens": 16150414.0, + "reward": 3.9616689682006836, + "reward_std": 0.21683254837989807, + "rewards/reward_fn/mean": 3.9616689682006836, + "rewards/reward_fn/std": 0.21683259308338165, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1047.0, + "completions/max_terminated_length": 1047.0, + "completions/mean_length": 465.9375, + "completions/mean_terminated_length": 465.9375, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.03871857430783918, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1875, + "kl": 0.043375019915401936, + "learning_rate": 7.854399999999999e-06, + "loss": 0.0017, + "num_tokens": 16226796.0, + "reward": 3.8493480682373047, + "reward_std": 0.4049968421459198, + "rewards/reward_fn/mean": 3.8493480682373047, + "rewards/reward_fn/std": 0.4049968123435974, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 146.46875, + "completions/mean_terminated_length": 146.46875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.03882465259361409, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1318359375, + "kl": 0.06945032655494288, + "learning_rate": 7.854e-06, + "loss": 0.0028, + "num_tokens": 16268891.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 274.15625, + "completions/mean_terminated_length": 274.15625, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.03893073087938899, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.390625, + "kl": 0.04315848933765665, + "learning_rate": 7.8536e-06, + "loss": 0.0017, + "num_tokens": 16311296.0, + "reward": 3.2710511684417725, + "reward_std": 0.4644310474395752, + "rewards/reward_fn/mean": 3.2710511684417725, + "rewards/reward_fn/std": 0.4644309878349304, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 189.0, + "completions/max_terminated_length": 189.0, + "completions/mean_length": 167.78125, + "completions/mean_terminated_length": 167.78125, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.03903680916516389, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11328125, + "kl": 0.051279135106597096, + "learning_rate": 7.8532e-06, + "loss": 0.0021, + "num_tokens": 16337785.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 161.0, + "completions/max_terminated_length": 161.0, + "completions/mean_length": 118.5, + "completions/mean_terminated_length": 118.5, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.039142887450938794, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19140625, + "kl": 0.09624231781344861, + "learning_rate": 7.8528e-06, + "loss": 0.0038, + "num_tokens": 16367145.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 211.8125, + "completions/mean_terminated_length": 211.8125, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.039248965736713694, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.08367881097365171, + "learning_rate": 7.8524e-06, + "loss": 0.0033, + "num_tokens": 16403747.0, + "reward": 3.0466554164886475, + "reward_std": 0.030376369133591652, + "rewards/reward_fn/mean": 3.0466554164886475, + "rewards/reward_fn/std": 0.0303763709962368, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 148.3125, + "completions/mean_terminated_length": 148.3125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.0393550440224886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.142578125, + "kl": 0.05079342168755829, + "learning_rate": 7.852e-06, + "loss": 0.002, + "num_tokens": 16452813.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 141.0, + "completions/mean_terminated_length": 141.0, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.0394611223082635, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1884765625, + "kl": 0.0790958609431982, + "learning_rate": 7.8516e-06, + "loss": 0.0032, + "num_tokens": 16480749.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 172.09375, + "completions/mean_terminated_length": 172.09375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.0395672005940384, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.138671875, + "kl": 0.07499957724940032, + "learning_rate": 7.8512e-06, + "loss": 0.003, + "num_tokens": 16527984.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 623.0, + "completions/max_terminated_length": 623.0, + "completions/mean_length": 449.53125, + "completions/mean_terminated_length": 449.53125, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.039673278879813305, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.05416272731963545, + "learning_rate": 7.8508e-06, + "loss": 0.0022, + "num_tokens": 16589505.0, + "reward": 1.9449317455291748, + "reward_std": 0.4181321859359741, + "rewards/reward_fn/mean": 1.9449317455291748, + "rewards/reward_fn/std": 0.4181321859359741, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 230.84375, + "completions/mean_terminated_length": 230.84375, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.039779357165588204, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.08303264563437551, + "learning_rate": 7.850399999999999e-06, + "loss": 0.0033, + "num_tokens": 16641596.0, + "reward": 3.153766393661499, + "reward_std": 0.05732966959476471, + "rewards/reward_fn/mean": 3.153766393661499, + "rewards/reward_fn/std": 0.0573296882212162, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 658.0, + "completions/mean_length": 270.53125, + "completions/mean_terminated_length": 213.19354248046875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.039885435451363103, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.78515625, + "kl": 0.1146129370899871, + "learning_rate": 7.85e-06, + "loss": 0.0046, + "num_tokens": 16681421.0, + "reward": 2.9682884216308594, + "reward_std": 0.6505692005157471, + "rewards/reward_fn/mean": 2.9682884216308594, + "rewards/reward_fn/std": 0.6505692005157471, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 232.90625, + "completions/mean_terminated_length": 232.90625, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.03999151373713801, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.06879323907196522, + "learning_rate": 7.849599999999999e-06, + "loss": 0.0027, + "num_tokens": 16720330.0, + "reward": 2.8407626152038574, + "reward_std": 0.019035818055272102, + "rewards/reward_fn/mean": 2.8407626152038574, + "rewards/reward_fn/std": 0.019035782665014267, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 382.0, + "completions/mean_terminated_length": 382.0, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.04009759202291291, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.053302777581848204, + "learning_rate": 7.8492e-06, + "loss": 0.0021, + "num_tokens": 16764714.0, + "reward": 2.8831186294555664, + "reward_std": 0.20725314319133759, + "rewards/reward_fn/mean": 2.8831186294555664, + "rewards/reward_fn/std": 0.2072531282901764, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1473.0, + "completions/max_terminated_length": 1473.0, + "completions/mean_length": 484.65625, + "completions/mean_terminated_length": 484.65625, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.04020367030868781, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.93359375, + "kl": 0.04577759699895978, + "learning_rate": 7.8488e-06, + "loss": 0.0018, + "num_tokens": 16814047.0, + "reward": 3.974400520324707, + "reward_std": 0.144812673330307, + "rewards/reward_fn/mean": 3.974400520324707, + "rewards/reward_fn/std": 0.1448127031326294, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1922.0, + "completions/mean_length": 769.53125, + "completions/mean_terminated_length": 684.300048828125, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.040309748594462715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.78515625, + "kl": 0.058462157379835844, + "learning_rate": 7.8484e-06, + "loss": 0.0023, + "num_tokens": 16880560.0, + "reward": 2.516724109649658, + "reward_std": 0.7297559976577759, + "rewards/reward_fn/mean": 2.516724109649658, + "rewards/reward_fn/std": 0.7297559976577759, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1420.0, + "completions/max_terminated_length": 1420.0, + "completions/mean_length": 323.53125, + "completions/mean_terminated_length": 323.53125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.040415826880237614, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.06136702420189977, + "learning_rate": 7.848e-06, + "loss": 0.0025, + "num_tokens": 16927777.0, + "reward": 3.8343114852905273, + "reward_std": 0.5490050911903381, + "rewards/reward_fn/mean": 3.8343114852905273, + "rewards/reward_fn/std": 0.5490050911903381, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 190.25, + "completions/mean_terminated_length": 190.25, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.04052190516601252, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1416015625, + "kl": 0.04089883284177631, + "learning_rate": 7.8476e-06, + "loss": 0.0016, + "num_tokens": 16949001.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 165.78125, + "completions/mean_terminated_length": 165.78125, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.04062798345178742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0888671875, + "kl": 0.03171997581375763, + "learning_rate": 7.8472e-06, + "loss": 0.0013, + "num_tokens": 16979586.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1416.0, + "completions/max_terminated_length": 1416.0, + "completions/mean_length": 920.875, + "completions/mean_terminated_length": 920.875, + "completions/min_length": 573.0, + "completions/min_terminated_length": 573.0, + "epoch": 0.04073406173756232, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9609375, + "kl": 0.03487598354695365, + "learning_rate": 7.846799999999999e-06, + "loss": 0.0014, + "num_tokens": 17051486.0, + "reward": 2.8845808506011963, + "reward_std": 0.4707050025463104, + "rewards/reward_fn/mean": 2.8845808506011963, + "rewards/reward_fn/std": 0.47070497274398804, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 307.15625, + "completions/mean_terminated_length": 307.15625, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.040840140023337225, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.04412680730456486, + "learning_rate": 7.8464e-06, + "loss": 0.0018, + "num_tokens": 17096515.0, + "reward": 2.8527889251708984, + "reward_std": 0.02638748660683632, + "rewards/reward_fn/mean": 2.8527889251708984, + "rewards/reward_fn/std": 0.026387471705675125, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.0, + "completions/max_terminated_length": 679.0, + "completions/mean_length": 451.25, + "completions/mean_terminated_length": 451.25, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.040946218309112124, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.04106189822778106, + "learning_rate": 7.845999999999999e-06, + "loss": 0.0016, + "num_tokens": 17144203.0, + "reward": 2.9010472297668457, + "reward_std": 0.03634057193994522, + "rewards/reward_fn/mean": 2.9010472297668457, + "rewards/reward_fn/std": 0.03634057566523552, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 843.0, + "completions/max_terminated_length": 843.0, + "completions/mean_length": 442.40625, + "completions/mean_terminated_length": 442.40625, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.041052296594887024, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.05888936563860625, + "learning_rate": 7.8456e-06, + "loss": 0.0024, + "num_tokens": 17190904.0, + "reward": 2.7666423320770264, + "reward_std": 0.17257817089557648, + "rewards/reward_fn/mean": 2.7666423320770264, + "rewards/reward_fn/std": 0.17257815599441528, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 169.5625, + "completions/mean_terminated_length": 169.5625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.04115837488066193, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1044921875, + "kl": 0.037654812389519066, + "learning_rate": 7.845199999999999e-06, + "loss": 0.0015, + "num_tokens": 17216970.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 166.0, + "completions/max_terminated_length": 166.0, + "completions/mean_length": 105.46875, + "completions/mean_terminated_length": 105.46875, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.04126445316643683, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12890625, + "kl": 0.046366847469471395, + "learning_rate": 7.8448e-06, + "loss": 0.0019, + "num_tokens": 17254361.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 207.375, + "completions/mean_terminated_length": 207.375, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.041370531452211735, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0810546875, + "kl": 0.047395337373018265, + "learning_rate": 7.8444e-06, + "loss": 0.0019, + "num_tokens": 17289285.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 222.6875, + "completions/mean_terminated_length": 222.6875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.041476609737986635, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.04588223801692948, + "learning_rate": 7.844e-06, + "loss": 0.0018, + "num_tokens": 17333147.0, + "reward": 3.4899792671203613, + "reward_std": 0.5518075227737427, + "rewards/reward_fn/mean": 3.4899792671203613, + "rewards/reward_fn/std": 0.5518075227737427, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 220.625, + "completions/mean_terminated_length": 220.625, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.041582688023761534, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.146484375, + "kl": 0.05707505450118333, + "learning_rate": 7.8436e-06, + "loss": 0.0023, + "num_tokens": 17370575.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 142.0, + "completions/max_terminated_length": 142.0, + "completions/mean_length": 111.40625, + "completions/mean_terminated_length": 111.40625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.04168876630953644, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08740234375, + "kl": 0.036936537886504084, + "learning_rate": 7.8432e-06, + "loss": 0.0015, + "num_tokens": 17418748.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 762.0, + "completions/max_terminated_length": 762.0, + "completions/mean_length": 409.90625, + "completions/mean_terminated_length": 409.90625, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.04179484459531134, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9921875, + "kl": 0.047665696358308196, + "learning_rate": 7.8428e-06, + "loss": 0.0019, + "num_tokens": 17465273.0, + "reward": 2.5570666790008545, + "reward_std": 0.24715420603752136, + "rewards/reward_fn/mean": 2.5570666790008545, + "rewards/reward_fn/std": 0.247154101729393, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 605.0, + "completions/max_terminated_length": 605.0, + "completions/mean_length": 248.03125, + "completions/mean_terminated_length": 248.03125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.04190092288108624, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.07105874724220484, + "learning_rate": 7.8424e-06, + "loss": 0.0028, + "num_tokens": 17503386.0, + "reward": 2.808501720428467, + "reward_std": 0.023406412452459335, + "rewards/reward_fn/mean": 2.808501720428467, + "rewards/reward_fn/std": 0.023406431078910828, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 247.6875, + "completions/mean_terminated_length": 247.6875, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.042007001166861145, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.04195737303234637, + "learning_rate": 7.841999999999999e-06, + "loss": 0.0017, + "num_tokens": 17542512.0, + "reward": 2.7778372764587402, + "reward_std": 0.22531868517398834, + "rewards/reward_fn/mean": 2.7778372764587402, + "rewards/reward_fn/std": 0.22531865537166595, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 87.0, + "completions/max_terminated_length": 87.0, + "completions/mean_length": 78.34375, + "completions/mean_terminated_length": 78.34375, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.042113079452636044, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.171875, + "kl": 0.044936935941223055, + "learning_rate": 7.8416e-06, + "loss": 0.0018, + "num_tokens": 17579419.0, + "reward": 3.0178000926971436, + "reward_std": 0.008912133984267712, + "rewards/reward_fn/mean": 3.0178000926971436, + "rewards/reward_fn/std": 0.00891213957220316, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.0, + "completions/max_terminated_length": 524.0, + "completions/mean_length": 376.65625, + "completions/mean_terminated_length": 376.65625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.04221915773841095, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.046278940804768354, + "learning_rate": 7.841199999999999e-06, + "loss": 0.0019, + "num_tokens": 17624592.0, + "reward": 2.8509998321533203, + "reward_std": 0.06351502239704132, + "rewards/reward_fn/mean": 2.8509998321533203, + "rewards/reward_fn/std": 0.06351498514413834, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 275.40625, + "completions/mean_terminated_length": 275.40625, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.04232523602418585, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.07029017817694694, + "learning_rate": 7.8408e-06, + "loss": 0.0028, + "num_tokens": 17684509.0, + "reward": 3.8989593982696533, + "reward_std": 0.3194463849067688, + "rewards/reward_fn/mean": 3.8989593982696533, + "rewards/reward_fn/std": 0.3194463551044464, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 779.0, + "completions/max_terminated_length": 779.0, + "completions/mean_length": 455.0, + "completions/mean_terminated_length": 455.0, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.04243131430996075, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.054996508290059865, + "learning_rate": 7.840399999999999e-06, + "loss": 0.0022, + "num_tokens": 17739645.0, + "reward": 3.0377087593078613, + "reward_std": 0.31760552525520325, + "rewards/reward_fn/mean": 3.0377087593078613, + "rewards/reward_fn/std": 0.31760555505752563, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 189.3125, + "completions/mean_terminated_length": 189.3125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.042537392595735656, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1552734375, + "kl": 0.06360255589243025, + "learning_rate": 7.84e-06, + "loss": 0.0025, + "num_tokens": 17798055.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 244.84375, + "completions/mean_terminated_length": 244.84375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.042643470881510555, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.296875, + "kl": 0.057871548808179796, + "learning_rate": 7.8396e-06, + "loss": 0.0023, + "num_tokens": 17840002.0, + "reward": 3.0404982566833496, + "reward_std": 0.36853575706481934, + "rewards/reward_fn/mean": 3.0404982566833496, + "rewards/reward_fn/std": 0.36853572726249695, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 244.84375, + "completions/mean_terminated_length": 244.84375, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.042749549167285454, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3125, + "kl": 0.049547198228538036, + "learning_rate": 7.8392e-06, + "loss": 0.002, + "num_tokens": 17884829.0, + "reward": 2.909618377685547, + "reward_std": 0.01886744424700737, + "rewards/reward_fn/mean": 2.909618377685547, + "rewards/reward_fn/std": 0.018867461010813713, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1095.0, + "completions/max_terminated_length": 1095.0, + "completions/mean_length": 577.65625, + "completions/mean_terminated_length": 577.65625, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.04285562745306036, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.04880936082918197, + "learning_rate": 7.8388e-06, + "loss": 0.002, + "num_tokens": 17938258.0, + "reward": 2.958042621612549, + "reward_std": 0.6647323966026306, + "rewards/reward_fn/mean": 2.958042621612549, + "rewards/reward_fn/std": 0.6647323369979858, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 170.0, + "completions/max_terminated_length": 170.0, + "completions/mean_length": 148.6875, + "completions/mean_terminated_length": 148.6875, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.04296170573883526, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1552734375, + "kl": 0.08973181049805135, + "learning_rate": 7.8384e-06, + "loss": 0.0036, + "num_tokens": 17989672.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 242.0, + "completions/mean_terminated_length": 242.0, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.04306778402461016, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.119140625, + "kl": 0.07667279534507543, + "learning_rate": 7.838e-06, + "loss": 0.0031, + "num_tokens": 18012776.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 248.375, + "completions/mean_terminated_length": 248.375, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.043173862310385065, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0751953125, + "kl": 0.0704468511394225, + "learning_rate": 7.8376e-06, + "loss": 0.0028, + "num_tokens": 18057108.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 240.4375, + "completions/mean_terminated_length": 240.4375, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.043279940596159965, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.035191603121347725, + "learning_rate": 7.8372e-06, + "loss": 0.0014, + "num_tokens": 18145698.0, + "reward": 3.6485748291015625, + "reward_std": 0.8297750353813171, + "rewards/reward_fn/mean": 3.6485748291015625, + "rewards/reward_fn/std": 0.8297750353813171, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 147.96875, + "completions/mean_terminated_length": 147.96875, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.04338601888193487, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0888671875, + "kl": 0.07083654997404665, + "learning_rate": 7.8368e-06, + "loss": 0.0028, + "num_tokens": 18169281.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 196.375, + "completions/mean_terminated_length": 196.375, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.04349209716770977, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1572265625, + "kl": 0.0727719494025223, + "learning_rate": 7.8364e-06, + "loss": 0.0029, + "num_tokens": 18211149.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 221.46875, + "completions/mean_terminated_length": 221.46875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.04359817545348467, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11669921875, + "kl": 0.0683895600377582, + "learning_rate": 7.836e-06, + "loss": 0.0027, + "num_tokens": 18253244.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 77.0, + "completions/max_terminated_length": 77.0, + "completions/mean_length": 71.5625, + "completions/mean_terminated_length": 71.5625, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.043704253739259576, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2236328125, + "kl": 0.034835965518141165, + "learning_rate": 7.8356e-06, + "loss": 0.0014, + "num_tokens": 18287022.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 688.0, + "completions/max_terminated_length": 688.0, + "completions/mean_length": 397.03125, + "completions/mean_terminated_length": 397.03125, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.043810332025034475, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.07421079277992249, + "learning_rate": 7.8352e-06, + "loss": 0.003, + "num_tokens": 18335087.0, + "reward": 2.8003878593444824, + "reward_std": 0.02719496749341488, + "rewards/reward_fn/mean": 2.8003878593444824, + "rewards/reward_fn/std": 0.027194969356060028, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 126.5625, + "completions/mean_terminated_length": 126.5625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.043916410310809374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1416015625, + "kl": 0.08099708310328424, + "learning_rate": 7.8348e-06, + "loss": 0.0032, + "num_tokens": 18373889.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 160.53125, + "completions/mean_terminated_length": 160.53125, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.04402248859658428, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10986328125, + "kl": 0.07151383883319795, + "learning_rate": 7.834399999999999e-06, + "loss": 0.0029, + "num_tokens": 18411602.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 215.1875, + "completions/mean_terminated_length": 215.1875, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.04412856688235918, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9375, + "kl": 0.03778650192543864, + "learning_rate": 7.834e-06, + "loss": 0.0015, + "num_tokens": 18458776.0, + "reward": 3.094648838043213, + "reward_std": 0.09678830951452255, + "rewards/reward_fn/mean": 3.094648838043213, + "rewards/reward_fn/std": 0.09678832441568375, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 344.4375, + "completions/mean_terminated_length": 344.4375, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.044234645168134086, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.0643120965687558, + "learning_rate": 7.833599999999999e-06, + "loss": 0.0026, + "num_tokens": 18508710.0, + "reward": 1.7739882469177246, + "reward_std": 0.015309223905205727, + "rewards/reward_fn/mean": 1.7739882469177246, + "rewards/reward_fn/std": 0.015309221111238003, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 261.65625, + "completions/mean_terminated_length": 261.65625, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.044340723453908985, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1181640625, + "kl": 0.07708191289566457, + "learning_rate": 7.8332e-06, + "loss": 0.0031, + "num_tokens": 18552283.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 255.5625, + "completions/mean_terminated_length": 255.5625, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.044446801739683885, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08740234375, + "kl": 0.030837459082249552, + "learning_rate": 7.832799999999999e-06, + "loss": 0.0012, + "num_tokens": 18593517.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 159.0, + "completions/max_terminated_length": 159.0, + "completions/mean_length": 139.03125, + "completions/mean_terminated_length": 139.03125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.04455288002545879, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1630859375, + "kl": 0.07660691998898983, + "learning_rate": 7.8324e-06, + "loss": 0.0031, + "num_tokens": 18633038.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 178.0, + "completions/max_terminated_length": 178.0, + "completions/mean_length": 147.34375, + "completions/mean_terminated_length": 147.34375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.04465895831123369, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1650390625, + "kl": 0.062265314627438784, + "learning_rate": 7.831999999999999e-06, + "loss": 0.0025, + "num_tokens": 18662009.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 167.34375, + "completions/mean_terminated_length": 167.34375, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.04476503659700859, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07275390625, + "kl": 0.038907009293325245, + "learning_rate": 7.8316e-06, + "loss": 0.0016, + "num_tokens": 18715684.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 196.0, + "completions/max_terminated_length": 196.0, + "completions/mean_length": 165.1875, + "completions/mean_terminated_length": 165.1875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.044871114882783496, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1435546875, + "kl": 0.045978159294463694, + "learning_rate": 7.831199999999999e-06, + "loss": 0.0018, + "num_tokens": 18759914.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 223.0625, + "completions/mean_terminated_length": 223.0625, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.044977193168558395, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.123046875, + "kl": 0.06679196062032133, + "learning_rate": 7.8308e-06, + "loss": 0.0027, + "num_tokens": 18810924.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 569.0, + "completions/max_terminated_length": 569.0, + "completions/mean_length": 413.96875, + "completions/mean_terminated_length": 413.96875, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.0450832714543333, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.050274993176572025, + "learning_rate": 7.8304e-06, + "loss": 0.002, + "num_tokens": 18858699.0, + "reward": 3.965904474258423, + "reward_std": 0.1928737610578537, + "rewards/reward_fn/mean": 3.965904474258423, + "rewards/reward_fn/std": 0.1928737461566925, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 119.0, + "completions/max_terminated_length": 119.0, + "completions/mean_length": 98.21875, + "completions/mean_terminated_length": 98.21875, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.0451893497401082, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1708984375, + "kl": 0.03456445218762383, + "learning_rate": 7.83e-06, + "loss": 0.0014, + "num_tokens": 18906642.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 267.96875, + "completions/mean_terminated_length": 267.96875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.0452954280258831, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.07573176105506718, + "learning_rate": 7.8296e-06, + "loss": 0.003, + "num_tokens": 18946129.0, + "reward": 2.3788418769836426, + "reward_std": 0.5687231421470642, + "rewards/reward_fn/mean": 2.3788418769836426, + "rewards/reward_fn/std": 0.5687231421470642, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 160.5, + "completions/mean_terminated_length": 160.5, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.045401506311658006, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "kl": 0.06303711561486125, + "learning_rate": 7.8292e-06, + "loss": 0.0025, + "num_tokens": 18986625.0, + "reward": 3.611386775970459, + "reward_std": 0.5146521925926208, + "rewards/reward_fn/mean": 3.611386775970459, + "rewards/reward_fn/std": 0.5146521925926208, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 254.53125, + "completions/mean_terminated_length": 254.53125, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.045507584597432905, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08447265625, + "kl": 0.040832315338775516, + "learning_rate": 7.8288e-06, + "loss": 0.0016, + "num_tokens": 19041426.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1443.0, + "completions/max_terminated_length": 1443.0, + "completions/mean_length": 800.96875, + "completions/mean_terminated_length": 800.96875, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, + "epoch": 0.045613662883207805, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.055984109407290816, + "learning_rate": 7.8284e-06, + "loss": 0.0022, + "num_tokens": 19111281.0, + "reward": 2.264820098876953, + "reward_std": 0.5752713084220886, + "rewards/reward_fn/mean": 2.264820098876953, + "rewards/reward_fn/std": 0.5752713084220886, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 250.375, + "completions/mean_terminated_length": 250.375, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.04571974116898271, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11083984375, + "kl": 0.0701156510040164, + "learning_rate": 7.828e-06, + "loss": 0.0028, + "num_tokens": 19154909.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 200.21875, + "completions/mean_terminated_length": 200.21875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.04582581945475761, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.04736822139238939, + "learning_rate": 7.8276e-06, + "loss": 0.0019, + "num_tokens": 19181796.0, + "reward": 3.932534694671631, + "reward_std": 0.26547369360923767, + "rewards/reward_fn/mean": 3.932534694671631, + "rewards/reward_fn/std": 0.26547372341156006, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 300.53125, + "completions/mean_terminated_length": 300.53125, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.04593189774053251, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.029562294948846102, + "learning_rate": 7.8272e-06, + "loss": 0.0012, + "num_tokens": 19240213.0, + "reward": 2.923275947570801, + "reward_std": 0.02933095581829548, + "rewards/reward_fn/mean": 2.923275947570801, + "rewards/reward_fn/std": 0.02933092787861824, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 142.0, + "completions/max_terminated_length": 142.0, + "completions/mean_length": 102.0, + "completions/mean_terminated_length": 102.0, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.046037976026307416, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1728515625, + "kl": 0.0631599115440622, + "learning_rate": 7.8268e-06, + "loss": 0.0025, + "num_tokens": 19266869.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 203.8125, + "completions/mean_terminated_length": 203.8125, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.046144054312082315, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "kl": 0.09093636996112764, + "learning_rate": 7.826399999999998e-06, + "loss": 0.0036, + "num_tokens": 19304239.0, + "reward": 3.0638785362243652, + "reward_std": 0.0846947655081749, + "rewards/reward_fn/mean": 3.0638785362243652, + "rewards/reward_fn/std": 0.0846947655081749, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 291.625, + "completions/mean_terminated_length": 291.625, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.04625013259785722, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.0460854871198535, + "learning_rate": 7.826e-06, + "loss": 0.0018, + "num_tokens": 19357891.0, + "reward": 3.0054774284362793, + "reward_std": 0.10288947075605392, + "rewards/reward_fn/mean": 3.0054774284362793, + "rewards/reward_fn/std": 0.10288945585489273, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 255.4375, + "completions/mean_terminated_length": 255.4375, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.04635621088363212, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.05067626805976033, + "learning_rate": 7.8256e-06, + "loss": 0.002, + "num_tokens": 19417105.0, + "reward": 3.9661037921905518, + "reward_std": 0.1917456090450287, + "rewards/reward_fn/mean": 3.9661037921905518, + "rewards/reward_fn/std": 0.1917456090450287, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 306.0625, + "completions/mean_terminated_length": 306.0625, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.04646228916940702, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.060956618282943964, + "learning_rate": 7.8252e-06, + "loss": 0.0024, + "num_tokens": 19462579.0, + "reward": 3.7817397117614746, + "reward_std": 0.3843442499637604, + "rewards/reward_fn/mean": 3.7817397117614746, + "rewards/reward_fn/std": 0.38434427976608276, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 212.375, + "completions/mean_terminated_length": 212.375, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.046568367455181926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10498046875, + "kl": 0.051055049581918865, + "learning_rate": 7.8248e-06, + "loss": 0.002, + "num_tokens": 19502815.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 156.375, + "completions/mean_terminated_length": 156.375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.046674445740956826, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.0625815651146695, + "learning_rate": 7.824399999999999e-06, + "loss": 0.0025, + "num_tokens": 19548203.0, + "reward": 3.163956642150879, + "reward_std": 0.020623939111828804, + "rewards/reward_fn/mean": 3.163956642150879, + "rewards/reward_fn/std": 0.020623959600925446, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 173.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 150.8125, + "completions/mean_terminated_length": 150.8125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.046780524026731725, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.138671875, + "kl": 0.06158293504267931, + "learning_rate": 7.824e-06, + "loss": 0.0025, + "num_tokens": 19589381.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 167.0, + "completions/max_terminated_length": 167.0, + "completions/mean_length": 133.1875, + "completions/mean_terminated_length": 133.1875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.04688660231250663, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1240234375, + "kl": 0.05522835114970803, + "learning_rate": 7.823599999999999e-06, + "loss": 0.0022, + "num_tokens": 19629899.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 265.40625, + "completions/mean_terminated_length": 265.40625, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.04699268059828153, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.054093458864372224, + "learning_rate": 7.8232e-06, + "loss": 0.0022, + "num_tokens": 19673176.0, + "reward": 3.9060635566711426, + "reward_std": 0.2969805896282196, + "rewards/reward_fn/mean": 3.9060635566711426, + "rewards/reward_fn/std": 0.29698053002357483, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 266.53125, + "completions/mean_terminated_length": 266.53125, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.04709875888405644, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11572265625, + "kl": 0.03358338208636269, + "learning_rate": 7.822799999999999e-06, + "loss": 0.0013, + "num_tokens": 19714153.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 170.90625, + "completions/mean_terminated_length": 170.90625, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.047204837169831336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13671875, + "kl": 0.06025209149811417, + "learning_rate": 7.8224e-06, + "loss": 0.0024, + "num_tokens": 19752198.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 170.59375, + "completions/mean_terminated_length": 170.59375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.047310915455606235, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0, + "kl": 0.03811126423534006, + "learning_rate": 7.821999999999999e-06, + "loss": 0.0015, + "num_tokens": 19774713.0, + "reward": 3.1091926097869873, + "reward_std": 0.03035632148385048, + "rewards/reward_fn/mean": 3.1091926097869873, + "rewards/reward_fn/std": 0.030356300994753838, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 382.125, + "completions/mean_terminated_length": 382.125, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.04741699374138114, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.038349084730725735, + "learning_rate": 7.8216e-06, + "loss": 0.0015, + "num_tokens": 19808125.0, + "reward": 2.8521904945373535, + "reward_std": 0.07935212552547455, + "rewards/reward_fn/mean": 2.8521904945373535, + "rewards/reward_fn/std": 0.07935213297605515, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 138.40625, + "completions/mean_terminated_length": 138.40625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.04752307202715604, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1279296875, + "kl": 0.053723981720395386, + "learning_rate": 7.8212e-06, + "loss": 0.0021, + "num_tokens": 19851658.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 186.75, + "completions/mean_terminated_length": 186.75, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.04762915031293094, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1357421875, + "kl": 0.05036897730315104, + "learning_rate": 7.8208e-06, + "loss": 0.002, + "num_tokens": 19909602.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 115.0, + "completions/max_terminated_length": 115.0, + "completions/mean_length": 83.46875, + "completions/mean_terminated_length": 83.46875, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.047735228598705846, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08984375, + "kl": 0.019947283988585696, + "learning_rate": 7.8204e-06, + "loss": 0.0008, + "num_tokens": 19944849.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 134.0, + "completions/max_terminated_length": 134.0, + "completions/mean_length": 101.125, + "completions/mean_terminated_length": 101.125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.047841306884480746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12451171875, + "kl": 0.03120685095200315, + "learning_rate": 7.82e-06, + "loss": 0.0012, + "num_tokens": 19989013.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1136.0, + "completions/max_terminated_length": 1136.0, + "completions/mean_length": 483.8125, + "completions/mean_terminated_length": 483.8125, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.04794738517025565, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.06700396462110803, + "learning_rate": 7.8196e-06, + "loss": 0.0027, + "num_tokens": 20041839.0, + "reward": 2.9224772453308105, + "reward_std": 0.4150664508342743, + "rewards/reward_fn/mean": 2.9224772453308105, + "rewards/reward_fn/std": 0.4150664508342743, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 179.53125, + "completions/mean_terminated_length": 179.53125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.04805346345603055, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.150390625, + "kl": 0.060416163643822074, + "learning_rate": 7.8192e-06, + "loss": 0.0024, + "num_tokens": 20064992.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 152.5, + "completions/mean_terminated_length": 152.5, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.04815954174180545, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10888671875, + "kl": 0.06531961727887392, + "learning_rate": 7.8188e-06, + "loss": 0.0026, + "num_tokens": 20108816.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 160.03125, + "completions/mean_terminated_length": 160.03125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.04826562002758036, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.28125, + "kl": 0.058511899900622666, + "learning_rate": 7.8184e-06, + "loss": 0.0023, + "num_tokens": 20144977.0, + "reward": 2.8220136165618896, + "reward_std": 0.04616091400384903, + "rewards/reward_fn/mean": 2.8220136165618896, + "rewards/reward_fn/std": 0.046160902827978134, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 126.15625, + "completions/mean_terminated_length": 126.15625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.048371698313355256, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.345703125, + "kl": 0.02876890735933557, + "learning_rate": 7.817999999999999e-06, + "loss": 0.0012, + "num_tokens": 20197078.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 100.84375, + "completions/mean_terminated_length": 100.84375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.048477776599130155, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "kl": 0.039327553124167025, + "learning_rate": 7.8176e-06, + "loss": 0.0016, + "num_tokens": 20234897.0, + "reward": 3.0898919105529785, + "reward_std": 0.03461963310837746, + "rewards/reward_fn/mean": 3.0898919105529785, + "rewards/reward_fn/std": 0.03461962565779686, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 685.0, + "completions/max_terminated_length": 685.0, + "completions/mean_length": 442.59375, + "completions/mean_terminated_length": 442.59375, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.04858385488490506, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.05146264052018523, + "learning_rate": 7.817199999999999e-06, + "loss": 0.0021, + "num_tokens": 20283204.0, + "reward": 3.024125099182129, + "reward_std": 0.13125301897525787, + "rewards/reward_fn/mean": 3.024125099182129, + "rewards/reward_fn/std": 0.13125301897525787, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 178.5625, + "completions/mean_terminated_length": 178.5625, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.04868993317067996, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1240234375, + "kl": 0.0506303379079327, + "learning_rate": 7.8168e-06, + "loss": 0.002, + "num_tokens": 20332694.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 189.59375, + "completions/mean_terminated_length": 189.59375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.04879601145645486, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08544921875, + "kl": 0.037065216922201216, + "learning_rate": 7.8164e-06, + "loss": 0.0015, + "num_tokens": 20377769.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 274.0625, + "completions/mean_terminated_length": 274.0625, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.048902089742229767, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07421875, + "kl": 0.03126209176843986, + "learning_rate": 7.816e-06, + "loss": 0.0013, + "num_tokens": 20428491.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 248.71875, + "completions/mean_terminated_length": 248.71875, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.049008168028004666, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.05388626334024593, + "learning_rate": 7.8156e-06, + "loss": 0.0022, + "num_tokens": 20460482.0, + "reward": 2.950096607208252, + "reward_std": 0.07136090844869614, + "rewards/reward_fn/mean": 2.950096607208252, + "rewards/reward_fn/std": 0.07136087864637375, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 348.0625, + "completions/mean_terminated_length": 348.0625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.04911424631377957, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1875, + "kl": 0.04740521451458335, + "learning_rate": 7.8152e-06, + "loss": 0.0019, + "num_tokens": 20493988.0, + "reward": 2.8588528633117676, + "reward_std": 0.027407808229327202, + "rewards/reward_fn/mean": 2.8588528633117676, + "rewards/reward_fn/std": 0.027407843619585037, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 629.0, + "completions/max_terminated_length": 629.0, + "completions/mean_length": 387.84375, + "completions/mean_terminated_length": 387.84375, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.04922032459955447, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.048984733468387276, + "learning_rate": 7.8148e-06, + "loss": 0.002, + "num_tokens": 20542143.0, + "reward": 2.9211349487304688, + "reward_std": 0.02732696197926998, + "rewards/reward_fn/mean": 2.9211349487304688, + "rewards/reward_fn/std": 0.02732696942985058, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1078.0, + "completions/max_terminated_length": 1078.0, + "completions/mean_length": 484.40625, + "completions/mean_terminated_length": 484.40625, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "epoch": 0.04932640288532937, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06689453125, + "kl": 0.04254200303694233, + "learning_rate": 7.8144e-06, + "loss": 0.0017, + "num_tokens": 20600140.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 235.9375, + "completions/mean_terminated_length": 235.9375, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.04943248117110428, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.04190182563615963, + "learning_rate": 7.814e-06, + "loss": 0.0017, + "num_tokens": 20640522.0, + "reward": 1.7360775470733643, + "reward_std": 0.008762822486460209, + "rewards/reward_fn/mean": 1.7360775470733643, + "rewards/reward_fn/std": 0.008762827143073082, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 89.0, + "completions/max_terminated_length": 89.0, + "completions/mean_length": 77.65625, + "completions/mean_terminated_length": 77.65625, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.049538559456879176, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12353515625, + "kl": 0.017409066756954417, + "learning_rate": 7.8136e-06, + "loss": 0.0007, + "num_tokens": 20680607.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 635.0, + "completions/max_terminated_length": 635.0, + "completions/mean_length": 457.625, + "completions/mean_terminated_length": 457.625, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.049644637742654076, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.05563335045007989, + "learning_rate": 7.8132e-06, + "loss": 0.0022, + "num_tokens": 20725331.0, + "reward": 3.640636920928955, + "reward_std": 0.6801393032073975, + "rewards/reward_fn/mean": 3.640636920928955, + "rewards/reward_fn/std": 0.6801392436027527, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 187.3125, + "completions/mean_terminated_length": 187.3125, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.04975071602842898, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.04255328554427251, + "learning_rate": 7.812799999999999e-06, + "loss": 0.0017, + "num_tokens": 20765597.0, + "reward": 3.0404469966888428, + "reward_std": 0.4203813970088959, + "rewards/reward_fn/mean": 3.0404469966888428, + "rewards/reward_fn/std": 0.4203813970088959, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 157.09375, + "completions/mean_terminated_length": 157.09375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.04985679431420388, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12353515625, + "kl": 0.0492172134690918, + "learning_rate": 7.8124e-06, + "loss": 0.002, + "num_tokens": 20803616.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 302.71875, + "completions/mean_terminated_length": 302.71875, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.04996287259997879, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.0402672459022142, + "learning_rate": 7.812e-06, + "loss": 0.0016, + "num_tokens": 20853207.0, + "reward": 2.911402702331543, + "reward_std": 0.01763262040913105, + "rewards/reward_fn/mean": 2.911402702331543, + "rewards/reward_fn/std": 0.017632605507969856, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 169.75, + "completions/mean_terminated_length": 169.75, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.05006895088575369, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.05370669817784801, + "learning_rate": 7.8116e-06, + "loss": 0.0021, + "num_tokens": 20906767.0, + "reward": 3.9612836837768555, + "reward_std": 0.2190132439136505, + "rewards/reward_fn/mean": 3.9612836837768555, + "rewards/reward_fn/std": 0.21901322901248932, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 170.90625, + "completions/mean_terminated_length": 170.90625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.050175029171528586, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.04981033824151382, + "learning_rate": 7.8112e-06, + "loss": 0.002, + "num_tokens": 20952940.0, + "reward": 3.745567798614502, + "reward_std": 0.4885614812374115, + "rewards/reward_fn/mean": 3.745567798614502, + "rewards/reward_fn/std": 0.4885614812374115, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 149.15625, + "completions/mean_terminated_length": 149.15625, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.05028110745730349, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1376953125, + "kl": 0.031192386173643172, + "learning_rate": 7.8108e-06, + "loss": 0.0012, + "num_tokens": 20996337.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 104.0, + "completions/max_terminated_length": 104.0, + "completions/mean_length": 93.3125, + "completions/mean_terminated_length": 93.3125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.05038718574307839, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11083984375, + "kl": 0.02836272062268108, + "learning_rate": 7.810399999999999e-06, + "loss": 0.0011, + "num_tokens": 21020795.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 220.6875, + "completions/mean_terminated_length": 220.6875, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.05049326402885329, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.048613334831316024, + "learning_rate": 7.81e-06, + "loss": 0.002, + "num_tokens": 21054545.0, + "reward": 3.040393829345703, + "reward_std": 0.01075148768723011, + "rewards/reward_fn/mean": 3.040393829345703, + "rewards/reward_fn/std": 0.010751496069133282, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 158.375, + "completions/mean_terminated_length": 158.375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.0505993423146282, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.05732053401879966, + "learning_rate": 7.809599999999999e-06, + "loss": 0.0023, + "num_tokens": 21105821.0, + "reward": 2.80159592628479, + "reward_std": 0.06686852127313614, + "rewards/reward_fn/mean": 2.80159592628479, + "rewards/reward_fn/std": 0.06686852127313614, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 173.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 117.25, + "completions/mean_terminated_length": 117.25, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.050705420600403096, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13671875, + "kl": 0.030846741399727762, + "learning_rate": 7.8092e-06, + "loss": 0.0012, + "num_tokens": 21144581.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 317.375, + "completions/mean_terminated_length": 317.375, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.050811498886178, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.03433474572375417, + "learning_rate": 7.808799999999999e-06, + "loss": 0.0014, + "num_tokens": 21189425.0, + "reward": 3.131831169128418, + "reward_std": 0.16248132288455963, + "rewards/reward_fn/mean": 3.131831169128418, + "rewards/reward_fn/std": 0.16248130798339844, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 122.0, + "completions/max_terminated_length": 122.0, + "completions/mean_length": 80.34375, + "completions/mean_terminated_length": 80.34375, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.0509175771719529, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1875, + "kl": 0.03051563521148637, + "learning_rate": 7.8084e-06, + "loss": 0.0012, + "num_tokens": 21236604.0, + "reward": 3.8580493927001953, + "reward_std": 0.30022406578063965, + "rewards/reward_fn/mean": 3.8580493927001953, + "rewards/reward_fn/std": 0.30022403597831726, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 125.5, + "completions/mean_terminated_length": 125.5, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.0510236554577278, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1298828125, + "kl": 0.04617619724012911, + "learning_rate": 7.807999999999999e-06, + "loss": 0.0018, + "num_tokens": 21271788.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 178.0, + "completions/max_terminated_length": 178.0, + "completions/mean_length": 157.9375, + "completions/mean_terminated_length": 157.9375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.05112973374350271, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.53125, + "kl": 0.06474354478996247, + "learning_rate": 7.8076e-06, + "loss": 0.0026, + "num_tokens": 21310826.0, + "reward": 3.430511474609375, + "reward_std": 0.5437823534011841, + "rewards/reward_fn/mean": 3.430511474609375, + "rewards/reward_fn/std": 0.5437823534011841, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 218.75, + "completions/mean_terminated_length": 218.75, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.05123581202927761, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.036409151420230046, + "learning_rate": 7.8072e-06, + "loss": 0.0015, + "num_tokens": 21356002.0, + "reward": 1.8208963871002197, + "reward_std": 0.017528312280774117, + "rewards/reward_fn/mean": 1.8208963871002197, + "rewards/reward_fn/std": 0.01752830669283867, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 209.4375, + "completions/mean_terminated_length": 209.4375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.051341890315052506, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.03509134179330431, + "learning_rate": 7.8068e-06, + "loss": 0.0014, + "num_tokens": 21402096.0, + "reward": 3.185612916946411, + "reward_std": 0.013976151123642921, + "rewards/reward_fn/mean": 3.185612916946411, + "rewards/reward_fn/std": 0.0139761408790946, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 182.59375, + "completions/mean_terminated_length": 182.59375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.05144796860082741, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.048504149657674134, + "learning_rate": 7.8064e-06, + "loss": 0.0019, + "num_tokens": 21441379.0, + "reward": 2.9261727333068848, + "reward_std": 0.19896990060806274, + "rewards/reward_fn/mean": 2.9261727333068848, + "rewards/reward_fn/std": 0.19896981120109558, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 251.625, + "completions/mean_terminated_length": 251.625, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.05155404688660231, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.04867141193244606, + "learning_rate": 7.806e-06, + "loss": 0.0019, + "num_tokens": 21483063.0, + "reward": 3.803684949874878, + "reward_std": 0.8027096390724182, + "rewards/reward_fn/mean": 3.803684949874878, + "rewards/reward_fn/std": 0.802709698677063, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 131.5625, + "completions/mean_terminated_length": 131.5625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.05166012517237722, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8125, + "kl": 0.05726821324788034, + "learning_rate": 7.8056e-06, + "loss": 0.0023, + "num_tokens": 21521001.0, + "reward": 3.840595245361328, + "reward_std": 0.3371758460998535, + "rewards/reward_fn/mean": 3.840595245361328, + "rewards/reward_fn/std": 0.3371758460998535, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 839.0, + "completions/max_terminated_length": 839.0, + "completions/mean_length": 406.90625, + "completions/mean_terminated_length": 406.90625, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.05176620345815212, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.043418677465524524, + "learning_rate": 7.8052e-06, + "loss": 0.0017, + "num_tokens": 21585126.0, + "reward": 3.6899161338806152, + "reward_std": 0.5464287996292114, + "rewards/reward_fn/mean": 3.6899161338806152, + "rewards/reward_fn/std": 0.5464287996292114, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 733.0, + "completions/max_terminated_length": 733.0, + "completions/mean_length": 450.8125, + "completions/mean_terminated_length": 450.8125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.051872281743927016, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.05789197108242661, + "learning_rate": 7.8048e-06, + "loss": 0.0023, + "num_tokens": 21633888.0, + "reward": 2.7008860111236572, + "reward_std": 0.2668880224227905, + "rewards/reward_fn/mean": 2.7008860111236572, + "rewards/reward_fn/std": 0.2668880224227905, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.0, + "completions/max_terminated_length": 574.0, + "completions/mean_length": 379.5, + "completions/mean_terminated_length": 379.5, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.05197836002970192, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.04075531323906034, + "learning_rate": 7.8044e-06, + "loss": 0.0016, + "num_tokens": 21696080.0, + "reward": 2.7861926555633545, + "reward_std": 0.3229799270629883, + "rewards/reward_fn/mean": 2.7861926555633545, + "rewards/reward_fn/std": 0.3229798674583435, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 364.46875, + "completions/mean_terminated_length": 364.46875, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.05208443831547682, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.044861781294457614, + "learning_rate": 7.804e-06, + "loss": 0.0018, + "num_tokens": 21749247.0, + "reward": 2.9800658226013184, + "reward_std": 0.3406826853752136, + "rewards/reward_fn/mean": 2.9800658226013184, + "rewards/reward_fn/std": 0.3406826853752136, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 166.28125, + "completions/mean_terminated_length": 166.28125, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.05219051660125172, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.05009030376095325, + "learning_rate": 7.8036e-06, + "loss": 0.002, + "num_tokens": 21788488.0, + "reward": 2.9894442558288574, + "reward_std": 0.013725874945521355, + "rewards/reward_fn/mean": 2.9894442558288574, + "rewards/reward_fn/std": 0.013725854456424713, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 186.28125, + "completions/mean_terminated_length": 186.28125, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.05229659488702663, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1005859375, + "kl": 0.04771283362060785, + "learning_rate": 7.8032e-06, + "loss": 0.0019, + "num_tokens": 21830353.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 126.03125, + "completions/mean_terminated_length": 126.03125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.05240267317280153, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1943359375, + "kl": 0.05836413930228446, + "learning_rate": 7.8028e-06, + "loss": 0.0023, + "num_tokens": 21871282.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 256.28125, + "completions/mean_terminated_length": 256.28125, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.052508751458576426, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.08196262107230723, + "learning_rate": 7.8024e-06, + "loss": 0.0033, + "num_tokens": 21915227.0, + "reward": 3.26692795753479, + "reward_std": 0.4324604272842407, + "rewards/reward_fn/mean": 3.26692795753479, + "rewards/reward_fn/std": 0.4324604272842407, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 201.96875, + "completions/mean_terminated_length": 201.96875, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.05261482974435133, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.140625, + "kl": 0.06773088849149644, + "learning_rate": 7.802e-06, + "loss": 0.0027, + "num_tokens": 21968250.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 772.0, + "completions/max_terminated_length": 772.0, + "completions/mean_length": 460.6875, + "completions/mean_terminated_length": 460.6875, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.05272090803012623, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.04177248413907364, + "learning_rate": 7.8016e-06, + "loss": 0.0017, + "num_tokens": 22026704.0, + "reward": 2.7659690380096436, + "reward_std": 0.25846555829048157, + "rewards/reward_fn/mean": 2.7659690380096436, + "rewards/reward_fn/std": 0.25846555829048157, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1098.0, + "completions/max_terminated_length": 1098.0, + "completions/mean_length": 604.03125, + "completions/mean_terminated_length": 604.03125, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.05282698631590114, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.07456690678372979, + "learning_rate": 7.801199999999999e-06, + "loss": 0.003, + "num_tokens": 22081009.0, + "reward": 2.7125933170318604, + "reward_std": 0.33740636706352234, + "rewards/reward_fn/mean": 2.7125933170318604, + "rewards/reward_fn/std": 0.3374064266681671, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 268.78125, + "completions/mean_terminated_length": 268.78125, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.05293306460167604, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05908203125, + "kl": 0.036616402619984, + "learning_rate": 7.8008e-06, + "loss": 0.0015, + "num_tokens": 22115626.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 254.90625, + "completions/mean_terminated_length": 254.90625, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.05303914288745094, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1875, + "kl": 0.05103367427363992, + "learning_rate": 7.800399999999999e-06, + "loss": 0.002, + "num_tokens": 22166055.0, + "reward": 3.9605257511138916, + "reward_std": 0.22330042719841003, + "rewards/reward_fn/mean": 3.9605257511138916, + "rewards/reward_fn/std": 0.22330045700073242, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 57.78125, + "completions/mean_terminated_length": 57.78125, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.05314522117322584, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.177734375, + "kl": 0.020510083792032674, + "learning_rate": 7.8e-06, + "loss": 0.0008, + "num_tokens": 22186080.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 348.09375, + "completions/mean_terminated_length": 348.09375, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.05325129945900074, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.06456627510488033, + "learning_rate": 7.799599999999999e-06, + "loss": 0.0026, + "num_tokens": 22238659.0, + "reward": 3.9281535148620605, + "reward_std": 0.2827524244785309, + "rewards/reward_fn/mean": 3.9281535148620605, + "rewards/reward_fn/std": 0.2827524244785309, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 160.71875, + "completions/mean_terminated_length": 160.71875, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.05335737774477564, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.07040900446008891, + "learning_rate": 7.7992e-06, + "loss": 0.0028, + "num_tokens": 22278650.0, + "reward": 3.0478758811950684, + "reward_std": 0.008108945563435555, + "rewards/reward_fn/mean": 3.0478758811950684, + "rewards/reward_fn/std": 0.008108980022370815, + "step": 503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 233.65625, + "completions/mean_terminated_length": 233.65625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.05346345603055055, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.06506969581823796, + "learning_rate": 7.798799999999999e-06, + "loss": 0.0026, + "num_tokens": 22320079.0, + "reward": 3.0991227626800537, + "reward_std": 0.24598754942417145, + "rewards/reward_fn/mean": 3.0991227626800537, + "rewards/reward_fn/std": 0.24598753452301025, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 801.0, + "completions/max_terminated_length": 801.0, + "completions/mean_length": 415.71875, + "completions/mean_terminated_length": 415.71875, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.05356953431632545, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.04956794378813356, + "learning_rate": 7.7984e-06, + "loss": 0.002, + "num_tokens": 22381862.0, + "reward": 3.519956111907959, + "reward_std": 0.5920132398605347, + "rewards/reward_fn/mean": 3.519956111907959, + "rewards/reward_fn/std": 0.5920132398605347, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 229.46875, + "completions/mean_terminated_length": 229.46875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.05367561260210035, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09912109375, + "kl": 0.03545730945188552, + "learning_rate": 7.797999999999999e-06, + "loss": 0.0014, + "num_tokens": 22410741.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 264.4375, + "completions/mean_terminated_length": 264.4375, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.05378169088787525, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.05536438780836761, + "learning_rate": 7.7976e-06, + "loss": 0.0022, + "num_tokens": 22459363.0, + "reward": 3.8821825981140137, + "reward_std": 0.3202844560146332, + "rewards/reward_fn/mean": 3.8821825981140137, + "rewards/reward_fn/std": 0.3202844262123108, + "step": 507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 170.90625, + "completions/mean_terminated_length": 170.90625, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.05388776917365015, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1298828125, + "kl": 0.05457607749849558, + "learning_rate": 7.7972e-06, + "loss": 0.0022, + "num_tokens": 22502848.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 255.75, + "completions/mean_terminated_length": 255.75, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.05399384745942506, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10400390625, + "kl": 0.04397421167232096, + "learning_rate": 7.7968e-06, + "loss": 0.0018, + "num_tokens": 22555672.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 119.90625, + "completions/mean_terminated_length": 119.90625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.05409992574519996, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15625, + "kl": 0.0700938591035083, + "learning_rate": 7.7964e-06, + "loss": 0.0028, + "num_tokens": 22583125.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 176.21875, + "completions/mean_terminated_length": 176.21875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.05420600403097486, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.06420622754376382, + "learning_rate": 7.796e-06, + "loss": 0.0026, + "num_tokens": 22631228.0, + "reward": 2.773768424987793, + "reward_std": 0.05320408195257187, + "rewards/reward_fn/mean": 2.773768424987793, + "rewards/reward_fn/std": 0.053204067051410675, + "step": 511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 243.6875, + "completions/mean_terminated_length": 243.6875, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.05431208231674976, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11767578125, + "kl": 0.05076413287315518, + "learning_rate": 7.7956e-06, + "loss": 0.002, + "num_tokens": 22689362.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 187.03125, + "completions/mean_terminated_length": 187.03125, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.05441816060252466, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.04727089777588844, + "learning_rate": 7.7952e-06, + "loss": 0.0019, + "num_tokens": 22728115.0, + "reward": 2.8927512168884277, + "reward_std": 0.29177990555763245, + "rewards/reward_fn/mean": 2.8927512168884277, + "rewards/reward_fn/std": 0.29177987575531006, + "step": 513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 160.0, + "completions/max_terminated_length": 160.0, + "completions/mean_length": 131.03125, + "completions/mean_terminated_length": 131.03125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.05452423888829957, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16015625, + "kl": 0.06265121378237382, + "learning_rate": 7.7948e-06, + "loss": 0.0025, + "num_tokens": 22761108.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 109.0, + "completions/max_terminated_length": 109.0, + "completions/mean_length": 106.3125, + "completions/mean_terminated_length": 106.3125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.05463031717407447, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0634765625, + "kl": 0.010751996480394155, + "learning_rate": 7.7944e-06, + "loss": 0.0004, + "num_tokens": 22803678.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1898.0, + "completions/mean_length": 581.75, + "completions/mean_terminated_length": 534.4515991210938, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.05473639545984937, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.057617883081547916, + "learning_rate": 7.793999999999999e-06, + "loss": 0.0023, + "num_tokens": 22881142.0, + "reward": 3.3143372535705566, + "reward_std": 1.1114939451217651, + "rewards/reward_fn/mean": 3.3143372535705566, + "rewards/reward_fn/std": 1.1114939451217651, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 250.5625, + "completions/mean_terminated_length": 250.5625, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.05484247374562427, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.05981055251322687, + "learning_rate": 7.7936e-06, + "loss": 0.0024, + "num_tokens": 22919592.0, + "reward": 2.810237407684326, + "reward_std": 0.2666741907596588, + "rewards/reward_fn/mean": 2.810237407684326, + "rewards/reward_fn/std": 0.2666742503643036, + "step": 517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 91.0, + "completions/max_terminated_length": 91.0, + "completions/mean_length": 82.75, + "completions/mean_terminated_length": 82.75, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.05494855203139917, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1357421875, + "kl": 0.014161759259877726, + "learning_rate": 7.793199999999999e-06, + "loss": 0.0006, + "num_tokens": 22957280.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 176.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 151.59375, + "completions/mean_terminated_length": 151.59375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.05505463031717407, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1123046875, + "kl": 0.03626142651773989, + "learning_rate": 7.7928e-06, + "loss": 0.0015, + "num_tokens": 22993619.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 142.53125, + "completions/mean_terminated_length": 142.53125, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.05516070860294898, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12158203125, + "kl": 0.03131835992098786, + "learning_rate": 7.7924e-06, + "loss": 0.0013, + "num_tokens": 23031588.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 221.3125, + "completions/mean_terminated_length": 221.3125, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.05526678688872388, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1083984375, + "kl": 0.05314846872352064, + "learning_rate": 7.792e-06, + "loss": 0.0021, + "num_tokens": 23075534.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 689.0, + "completions/max_terminated_length": 689.0, + "completions/mean_length": 186.15625, + "completions/mean_terminated_length": 186.15625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.05537286517449878, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1240234375, + "kl": 0.06087932689115405, + "learning_rate": 7.7916e-06, + "loss": 0.0024, + "num_tokens": 23131827.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1519.0, + "completions/mean_length": 508.03125, + "completions/mean_terminated_length": 458.3548278808594, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.05547894346027368, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.045374272391200066, + "learning_rate": 7.7912e-06, + "loss": 0.0018, + "num_tokens": 23196692.0, + "reward": 2.6854634284973145, + "reward_std": 0.5392786860466003, + "rewards/reward_fn/mean": 2.6854634284973145, + "rewards/reward_fn/std": 0.5392786264419556, + "step": 523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 203.15625, + "completions/mean_terminated_length": 203.15625, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.05558502174604858, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11181640625, + "kl": 0.05111256393138319, + "learning_rate": 7.7908e-06, + "loss": 0.002, + "num_tokens": 23218713.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 112.96875, + "completions/mean_terminated_length": 112.96875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.05569110003182349, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "kl": 0.05130923929391429, + "learning_rate": 7.790399999999999e-06, + "loss": 0.0021, + "num_tokens": 23259032.0, + "reward": 3.9281158447265625, + "reward_std": 0.4066387414932251, + "rewards/reward_fn/mean": 3.9281158447265625, + "rewards/reward_fn/std": 0.4066386818885803, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 173.28125, + "completions/mean_terminated_length": 173.28125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.05579717831759839, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "kl": 0.0680079300655052, + "learning_rate": 7.79e-06, + "loss": 0.0027, + "num_tokens": 23297473.0, + "reward": 2.882020950317383, + "reward_std": 0.01828574575483799, + "rewards/reward_fn/mean": 2.882020950317383, + "rewards/reward_fn/std": 0.01828572154045105, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1657.0, + "completions/max_terminated_length": 1657.0, + "completions/mean_length": 455.5, + "completions/mean_terminated_length": 455.5, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.05590325660337329, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1171875, + "kl": 0.06674273777753115, + "learning_rate": 7.789599999999999e-06, + "loss": 0.0027, + "num_tokens": 23376529.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 207.375, + "completions/mean_terminated_length": 207.375, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.056009334889148193, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11376953125, + "kl": 0.04751398117514327, + "learning_rate": 7.7892e-06, + "loss": 0.0019, + "num_tokens": 23429757.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 581.0, + "completions/max_terminated_length": 581.0, + "completions/mean_length": 369.78125, + "completions/mean_terminated_length": 369.78125, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.05611541317492309, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.05224546114914119, + "learning_rate": 7.788799999999999e-06, + "loss": 0.0021, + "num_tokens": 23480214.0, + "reward": 2.8701210021972656, + "reward_std": 0.0751161128282547, + "rewards/reward_fn/mean": 2.8701210021972656, + "rewards/reward_fn/std": 0.0751161202788353, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 263.3125, + "completions/mean_terminated_length": 263.3125, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.05622149146069799, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.03188524296274409, + "learning_rate": 7.7884e-06, + "loss": 0.0013, + "num_tokens": 23523424.0, + "reward": 2.912999153137207, + "reward_std": 0.01211194321513176, + "rewards/reward_fn/mean": 2.912999153137207, + "rewards/reward_fn/std": 0.012111921794712543, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1049.0, + "completions/max_terminated_length": 1049.0, + "completions/mean_length": 343.875, + "completions/mean_terminated_length": 343.875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.0563275697464729, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08935546875, + "kl": 0.06691422103904188, + "learning_rate": 7.788e-06, + "loss": 0.0027, + "num_tokens": 23583132.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 920.0, + "completions/max_terminated_length": 920.0, + "completions/mean_length": 264.15625, + "completions/mean_terminated_length": 264.15625, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.0564336480322478, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10302734375, + "kl": 0.041861614503432065, + "learning_rate": 7.7876e-06, + "loss": 0.0017, + "num_tokens": 23625217.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 840.0, + "completions/max_terminated_length": 840.0, + "completions/mean_length": 290.375, + "completions/mean_terminated_length": 290.375, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.056539726318022704, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9375, + "kl": 0.045831574767362326, + "learning_rate": 7.7872e-06, + "loss": 0.0018, + "num_tokens": 23666861.0, + "reward": 3.0297441482543945, + "reward_std": 0.04320033639669418, + "rewards/reward_fn/mean": 3.0297441482543945, + "rewards/reward_fn/std": 0.04320032149553299, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 250.34375, + "completions/mean_terminated_length": 250.34375, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.0566458046037976, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.03298926952993497, + "learning_rate": 7.7868e-06, + "loss": 0.0013, + "num_tokens": 23695864.0, + "reward": 2.8847732543945312, + "reward_std": 0.0448918342590332, + "rewards/reward_fn/mean": 2.8847732543945312, + "rewards/reward_fn/std": 0.0448918342590332, + "step": 534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 173.5625, + "completions/mean_terminated_length": 173.5625, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.0567518828895725, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.130859375, + "kl": 0.04681810602778569, + "learning_rate": 7.7864e-06, + "loss": 0.0019, + "num_tokens": 23729130.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 305.8125, + "completions/mean_terminated_length": 305.8125, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.05685796117534741, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.0402856832370162, + "learning_rate": 7.786e-06, + "loss": 0.0016, + "num_tokens": 23804964.0, + "reward": 2.9268569946289062, + "reward_std": 0.19753608107566833, + "rewards/reward_fn/mean": 2.9268569946289062, + "rewards/reward_fn/std": 0.19753602147102356, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.0, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 260.5, + "completions/mean_terminated_length": 260.5, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.05696403946112231, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.53125, + "kl": 0.0687692086212337, + "learning_rate": 7.785599999999999e-06, + "loss": 0.0028, + "num_tokens": 23861652.0, + "reward": 3.521317958831787, + "reward_std": 0.48663264513015747, + "rewards/reward_fn/mean": 3.521317958831787, + "rewards/reward_fn/std": 0.48663264513015747, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 205.65625, + "completions/mean_terminated_length": 205.65625, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.05707011774689721, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10986328125, + "kl": 0.03987734788097441, + "learning_rate": 7.7852e-06, + "loss": 0.0016, + "num_tokens": 23889033.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 211.125, + "completions/mean_terminated_length": 211.125, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.057176196032672114, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.04420925548765808, + "learning_rate": 7.784799999999999e-06, + "loss": 0.0018, + "num_tokens": 23942509.0, + "reward": 3.1174285411834717, + "reward_std": 0.07023604959249496, + "rewards/reward_fn/mean": 3.1174285411834717, + "rewards/reward_fn/std": 0.07023605704307556, + "step": 539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 560.0, + "completions/max_terminated_length": 560.0, + "completions/mean_length": 396.53125, + "completions/mean_terminated_length": 396.53125, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.05728227431844701, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9921875, + "kl": 0.030567975947633386, + "learning_rate": 7.7844e-06, + "loss": 0.0012, + "num_tokens": 23992286.0, + "reward": 3.864678382873535, + "reward_std": 0.4610091745853424, + "rewards/reward_fn/mean": 3.864678382873535, + "rewards/reward_fn/std": 0.46100914478302, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 293.46875, + "completions/mean_terminated_length": 293.46875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.05738835260422192, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.038554588274564594, + "learning_rate": 7.783999999999999e-06, + "loss": 0.0015, + "num_tokens": 24036237.0, + "reward": 3.5523862838745117, + "reward_std": 0.5504615902900696, + "rewards/reward_fn/mean": 3.5523862838745117, + "rewards/reward_fn/std": 0.5504615902900696, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 262.75, + "completions/mean_terminated_length": 262.75, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.05749443088999682, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10400390625, + "kl": 0.050088691408745944, + "learning_rate": 7.7836e-06, + "loss": 0.002, + "num_tokens": 24083557.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 428.96875, + "completions/mean_terminated_length": 428.96875, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.05760050917577172, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8671875, + "kl": 0.03457546600839123, + "learning_rate": 7.7832e-06, + "loss": 0.0014, + "num_tokens": 24142052.0, + "reward": 3.9279990196228027, + "reward_std": 0.4072989225387573, + "rewards/reward_fn/mean": 3.9279990196228027, + "rewards/reward_fn/std": 0.40729889273643494, + "step": 543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 588.0, + "completions/max_terminated_length": 588.0, + "completions/mean_length": 417.9375, + "completions/mean_terminated_length": 417.9375, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.057706587461546624, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.032870883820578456, + "learning_rate": 7.7828e-06, + "loss": 0.0013, + "num_tokens": 24169410.0, + "reward": 3.8196969032287598, + "reward_std": 0.4258030652999878, + "rewards/reward_fn/mean": 3.8196969032287598, + "rewards/reward_fn/std": 0.4258030652999878, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 636.0, + "completions/max_terminated_length": 636.0, + "completions/mean_length": 387.71875, + "completions/mean_terminated_length": 387.71875, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.05781266574732152, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.039335515175480396, + "learning_rate": 7.7824e-06, + "loss": 0.0016, + "num_tokens": 24228249.0, + "reward": 2.525263786315918, + "reward_std": 0.4515567123889923, + "rewards/reward_fn/mean": 2.525263786315918, + "rewards/reward_fn/std": 0.4515567123889923, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 155.0, + "completions/max_terminated_length": 155.0, + "completions/mean_length": 119.03125, + "completions/mean_terminated_length": 119.03125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.05791874403309642, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2392578125, + "kl": 0.053942128957714885, + "learning_rate": 7.782e-06, + "loss": 0.0022, + "num_tokens": 24275610.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 133.0, + "completions/max_terminated_length": 133.0, + "completions/mean_length": 98.4375, + "completions/mean_terminated_length": 98.4375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.05802482231887133, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2412109375, + "kl": 0.03568551503121853, + "learning_rate": 7.7816e-06, + "loss": 0.0014, + "num_tokens": 24318056.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 604.0, + "completions/max_terminated_length": 604.0, + "completions/mean_length": 430.75, + "completions/mean_terminated_length": 430.75, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.05813090060464623, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.054423797759227455, + "learning_rate": 7.7812e-06, + "loss": 0.0022, + "num_tokens": 24351296.0, + "reward": 3.9609341621398926, + "reward_std": 0.2209891378879547, + "rewards/reward_fn/mean": 3.9609341621398926, + "rewards/reward_fn/std": 0.2209891378879547, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 230.28125, + "completions/mean_terminated_length": 230.28125, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.05823697889042113, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.125, + "kl": 0.05803138192277402, + "learning_rate": 7.7808e-06, + "loss": 0.0023, + "num_tokens": 24378729.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 222.21875, + "completions/mean_terminated_length": 222.21875, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.058343057176196034, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09130859375, + "kl": 0.034262404195033014, + "learning_rate": 7.7804e-06, + "loss": 0.0014, + "num_tokens": 24421136.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 291.8125, + "completions/mean_terminated_length": 291.8125, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.05844913546197093, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09375, + "kl": 0.054334802902303636, + "learning_rate": 7.78e-06, + "loss": 0.0022, + "num_tokens": 24486218.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 181.125, + "completions/mean_terminated_length": 181.125, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.05855521374774584, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.033949769916944206, + "learning_rate": 7.7796e-06, + "loss": 0.0014, + "num_tokens": 24523054.0, + "reward": 3.0280110836029053, + "reward_std": 0.04787445068359375, + "rewards/reward_fn/mean": 3.0280110836029053, + "rewards/reward_fn/std": 0.04787447676062584, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 317.1875, + "completions/mean_terminated_length": 317.1875, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.05866129203352074, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.03886931930901483, + "learning_rate": 7.7792e-06, + "loss": 0.0016, + "num_tokens": 24572404.0, + "reward": 3.0145347118377686, + "reward_std": 0.04721865430474281, + "rewards/reward_fn/mean": 3.0145347118377686, + "rewards/reward_fn/std": 0.04721866548061371, + "step": 553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 251.03125, + "completions/mean_terminated_length": 251.03125, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.05876737031929564, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08642578125, + "kl": 0.044029625481925905, + "learning_rate": 7.7788e-06, + "loss": 0.0018, + "num_tokens": 24616533.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 202.03125, + "completions/mean_terminated_length": 202.03125, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.058873448605070544, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1279296875, + "kl": 0.06479659979231656, + "learning_rate": 7.7784e-06, + "loss": 0.0026, + "num_tokens": 24656406.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 188.90625, + "completions/mean_terminated_length": 188.90625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.05897952689084544, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.359375, + "kl": 0.056396145024336874, + "learning_rate": 7.777999999999999e-06, + "loss": 0.0023, + "num_tokens": 24704339.0, + "reward": 2.929412603378296, + "reward_std": 0.2831609547138214, + "rewards/reward_fn/mean": 2.929412603378296, + "rewards/reward_fn/std": 0.283160924911499, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 221.875, + "completions/mean_terminated_length": 221.875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.05908560517662034, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.04244612640468404, + "learning_rate": 7.7776e-06, + "loss": 0.0017, + "num_tokens": 24742575.0, + "reward": 3.569821834564209, + "reward_std": 0.5647028088569641, + "rewards/reward_fn/mean": 3.569821834564209, + "rewards/reward_fn/std": 0.5647028088569641, + "step": 557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 229.78125, + "completions/mean_terminated_length": 229.78125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.05919168346239525, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.0631415651878342, + "learning_rate": 7.777199999999999e-06, + "loss": 0.0025, + "num_tokens": 24788456.0, + "reward": 3.678088903427124, + "reward_std": 0.485332190990448, + "rewards/reward_fn/mean": 3.678088903427124, + "rewards/reward_fn/std": 0.4853322207927704, + "step": 558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 272.0, + "completions/mean_terminated_length": 272.0, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.05929776174817015, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07373046875, + "kl": 0.03941875655436888, + "learning_rate": 7.7768e-06, + "loss": 0.0016, + "num_tokens": 24832360.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 185.96875, + "completions/mean_terminated_length": 185.96875, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.059403840033945055, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1357421875, + "kl": 0.05661877314560115, + "learning_rate": 7.776399999999999e-06, + "loss": 0.0023, + "num_tokens": 24873159.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 119.0, + "completions/mean_terminated_length": 119.0, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.059509918319719954, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.189453125, + "kl": 0.03810377966146916, + "learning_rate": 7.776e-06, + "loss": 0.0015, + "num_tokens": 24916487.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 174.125, + "completions/mean_terminated_length": 174.125, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.05961599660549485, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.107421875, + "kl": 0.05429914325941354, + "learning_rate": 7.775599999999999e-06, + "loss": 0.0022, + "num_tokens": 24958379.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 296.3125, + "completions/mean_terminated_length": 296.3125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.05972207489126976, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.04550172860035673, + "learning_rate": 7.7752e-06, + "loss": 0.0018, + "num_tokens": 24999541.0, + "reward": 3.886976957321167, + "reward_std": 0.3571126461029053, + "rewards/reward_fn/mean": 3.886976957321167, + "rewards/reward_fn/std": 0.3571126461029053, + "step": 563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 190.0, + "completions/max_terminated_length": 190.0, + "completions/mean_length": 151.5, + "completions/mean_terminated_length": 151.5, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.05982815317704466, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1396484375, + "kl": 0.03715689940145239, + "learning_rate": 7.774799999999999e-06, + "loss": 0.0015, + "num_tokens": 25028357.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 222.40625, + "completions/mean_terminated_length": 222.40625, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.05993423146281956, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.119140625, + "kl": 0.04165405425010249, + "learning_rate": 7.7744e-06, + "loss": 0.0017, + "num_tokens": 25073714.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 742.0, + "completions/max_terminated_length": 742.0, + "completions/mean_length": 361.90625, + "completions/mean_terminated_length": 361.90625, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.060040309748594464, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.05564211937598884, + "learning_rate": 7.774e-06, + "loss": 0.0022, + "num_tokens": 25129679.0, + "reward": 3.9234397411346436, + "reward_std": 0.24193021655082703, + "rewards/reward_fn/mean": 3.9234397411346436, + "rewards/reward_fn/std": 0.24193023145198822, + "step": 566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 294.1875, + "completions/mean_terminated_length": 294.1875, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.060146388034369364, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.036501555470749736, + "learning_rate": 7.7736e-06, + "loss": 0.0015, + "num_tokens": 25179445.0, + "reward": 2.908540725708008, + "reward_std": 0.22343216836452484, + "rewards/reward_fn/mean": 2.908540725708008, + "rewards/reward_fn/std": 0.22343213856220245, + "step": 567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 301.9375, + "completions/mean_terminated_length": 301.9375, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.06025246632014427, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.054916275781579316, + "learning_rate": 7.7732e-06, + "loss": 0.0022, + "num_tokens": 25227891.0, + "reward": 3.2726054191589355, + "reward_std": 0.4647665023803711, + "rewards/reward_fn/mean": 3.2726054191589355, + "rewards/reward_fn/std": 0.4647665023803711, + "step": 568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 174.59375, + "completions/mean_terminated_length": 174.59375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.06035854460591917, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.140625, + "kl": 0.05450990307144821, + "learning_rate": 7.7728e-06, + "loss": 0.0022, + "num_tokens": 25276134.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 283.8125, + "completions/mean_terminated_length": 283.8125, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.06046462289169407, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06787109375, + "kl": 0.026582436956232414, + "learning_rate": 7.7724e-06, + "loss": 0.0011, + "num_tokens": 25328416.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1634.0, + "completions/mean_length": 701.71875, + "completions/mean_terminated_length": 562.4483032226562, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "epoch": 0.060570701177468975, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.0481034837430343, + "learning_rate": 7.772e-06, + "loss": 0.0019, + "num_tokens": 25390199.0, + "reward": 2.5434768199920654, + "reward_std": 0.8583298325538635, + "rewards/reward_fn/mean": 2.5434768199920654, + "rewards/reward_fn/std": 0.8583298325538635, + "step": 571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 747.0, + "completions/max_terminated_length": 747.0, + "completions/mean_length": 558.1875, + "completions/mean_terminated_length": 558.1875, + "completions/min_length": 386.0, + "completions/min_terminated_length": 386.0, + "epoch": 0.060676779463243874, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.875, + "kl": 0.036773104628082365, + "learning_rate": 7.7716e-06, + "loss": 0.0015, + "num_tokens": 25444829.0, + "reward": 2.9064252376556396, + "reward_std": 0.025962000712752342, + "rewards/reward_fn/mean": 2.9064252376556396, + "rewards/reward_fn/std": 0.02596198581159115, + "step": 572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 85.0, + "completions/max_terminated_length": 85.0, + "completions/mean_length": 72.3125, + "completions/mean_terminated_length": 72.3125, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.06078285774901877, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.193359375, + "kl": 0.015629198125679977, + "learning_rate": 7.7712e-06, + "loss": 0.0006, + "num_tokens": 25481415.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 404.09375, + "completions/mean_terminated_length": 404.09375, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.06088893603479368, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.03797735116677359, + "learning_rate": 7.7708e-06, + "loss": 0.0015, + "num_tokens": 25538282.0, + "reward": 3.007188320159912, + "reward_std": 0.32522156834602356, + "rewards/reward_fn/mean": 3.007188320159912, + "rewards/reward_fn/std": 0.32522156834602356, + "step": 574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 187.96875, + "completions/mean_terminated_length": 187.96875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.06099501432056858, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10205078125, + "kl": 0.03846541151870042, + "learning_rate": 7.7704e-06, + "loss": 0.0015, + "num_tokens": 25579145.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 169.0, + "completions/max_terminated_length": 169.0, + "completions/mean_length": 139.375, + "completions/mean_terminated_length": 139.375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.06110109260634348, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12451171875, + "kl": 0.034549258183687925, + "learning_rate": 7.769999999999998e-06, + "loss": 0.0014, + "num_tokens": 25626293.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 183.6875, + "completions/mean_terminated_length": 183.6875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.061207170892118384, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.056792226212564856, + "learning_rate": 7.7696e-06, + "loss": 0.0023, + "num_tokens": 25677611.0, + "reward": 2.673492431640625, + "reward_std": 0.05745452642440796, + "rewards/reward_fn/mean": 2.673492431640625, + "rewards/reward_fn/std": 0.05745454132556915, + "step": 577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 250.96875, + "completions/mean_terminated_length": 250.96875, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.061313249177893284, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.0328554887091741, + "learning_rate": 7.7692e-06, + "loss": 0.0013, + "num_tokens": 25717642.0, + "reward": 2.903965950012207, + "reward_std": 0.3777007758617401, + "rewards/reward_fn/mean": 2.903965950012207, + "rewards/reward_fn/std": 0.3777008056640625, + "step": 578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 164.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 135.28125, + "completions/mean_terminated_length": 135.28125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.06141932746366819, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2392578125, + "kl": 0.03916344471508637, + "learning_rate": 7.7688e-06, + "loss": 0.0016, + "num_tokens": 25754707.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1216.0, + "completions/max_terminated_length": 1216.0, + "completions/mean_length": 530.03125, + "completions/mean_terminated_length": 530.03125, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.06152540574944309, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.07856652745977044, + "learning_rate": 7.7684e-06, + "loss": 0.0031, + "num_tokens": 25819956.0, + "reward": 3.016455888748169, + "reward_std": 0.26328423619270325, + "rewards/reward_fn/mean": 3.016455888748169, + "rewards/reward_fn/std": 0.26328420639038086, + "step": 580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 241.75, + "completions/mean_terminated_length": 241.75, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.06163148403521799, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.04260317754233256, + "learning_rate": 7.767999999999999e-06, + "loss": 0.0017, + "num_tokens": 25859212.0, + "reward": 3.762989044189453, + "reward_std": 0.502149224281311, + "rewards/reward_fn/mean": 3.762989044189453, + "rewards/reward_fn/std": 0.5021491646766663, + "step": 581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 305.0, + "completions/mean_terminated_length": 305.0, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.061737562320992895, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.625, + "kl": 0.05852566647808999, + "learning_rate": 7.7676e-06, + "loss": 0.0023, + "num_tokens": 25897740.0, + "reward": 2.340458393096924, + "reward_std": 0.49268436431884766, + "rewards/reward_fn/mean": 2.340458393096924, + "rewards/reward_fn/std": 0.49268433451652527, + "step": 582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 170.0, + "completions/max_terminated_length": 170.0, + "completions/mean_length": 104.40625, + "completions/mean_terminated_length": 104.40625, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.061843640606767794, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1943359375, + "kl": 0.03926274285186082, + "learning_rate": 7.767199999999999e-06, + "loss": 0.0016, + "num_tokens": 25931033.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 290.46875, + "completions/mean_terminated_length": 290.46875, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.06194971889254269, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.046176372619811445, + "learning_rate": 7.7668e-06, + "loss": 0.0018, + "num_tokens": 25981544.0, + "reward": 3.8755037784576416, + "reward_std": 0.39327624440193176, + "rewards/reward_fn/mean": 3.8755037784576416, + "rewards/reward_fn/std": 0.3932762145996094, + "step": 584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 223.1875, + "completions/mean_terminated_length": 223.1875, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.0620557971783176, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0654296875, + "kl": 0.021591511933365837, + "learning_rate": 7.766399999999999e-06, + "loss": 0.0009, + "num_tokens": 26032654.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 282.6875, + "completions/mean_terminated_length": 282.6875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.0621618754640925, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.03477652568835765, + "learning_rate": 7.766e-06, + "loss": 0.0014, + "num_tokens": 26062148.0, + "reward": 3.8180346488952637, + "reward_std": 0.4300572872161865, + "rewards/reward_fn/mean": 3.8180346488952637, + "rewards/reward_fn/std": 0.43005725741386414, + "step": 586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.0, + "completions/max_terminated_length": 517.0, + "completions/mean_length": 296.3125, + "completions/mean_terminated_length": 296.3125, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.062267953749867405, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.0395249537541531, + "learning_rate": 7.765599999999999e-06, + "loss": 0.0016, + "num_tokens": 26102062.0, + "reward": 2.6568655967712402, + "reward_std": 0.18853385746479034, + "rewards/reward_fn/mean": 2.6568655967712402, + "rewards/reward_fn/std": 0.18853387236595154, + "step": 587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 219.03125, + "completions/mean_terminated_length": 219.03125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.062374032035642304, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0986328125, + "kl": 0.04510620346991345, + "learning_rate": 7.7652e-06, + "loss": 0.0018, + "num_tokens": 26147535.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 163.71875, + "completions/mean_terminated_length": 163.71875, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.062480110321417204, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08984375, + "kl": 0.050036760105285794, + "learning_rate": 7.7648e-06, + "loss": 0.002, + "num_tokens": 26192390.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 194.0, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 140.8125, + "completions/mean_terminated_length": 140.8125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.0625861886071921, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.022869454056490213, + "learning_rate": 7.7644e-06, + "loss": 0.0009, + "num_tokens": 26232288.0, + "reward": 3.9345054626464844, + "reward_std": 0.2577376961708069, + "rewards/reward_fn/mean": 3.9345054626464844, + "rewards/reward_fn/std": 0.2577377259731293, + "step": 590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 194.375, + "completions/mean_terminated_length": 194.375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.06269226689296702, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.026837114244699478, + "learning_rate": 7.764e-06, + "loss": 0.0011, + "num_tokens": 26272428.0, + "reward": 3.941838264465332, + "reward_std": 0.22888779640197754, + "rewards/reward_fn/mean": 3.941838264465332, + "rewards/reward_fn/std": 0.22888781130313873, + "step": 591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 174.46875, + "completions/mean_terminated_length": 174.46875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.06279834517874192, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10107421875, + "kl": 0.025287829048465937, + "learning_rate": 7.7636e-06, + "loss": 0.001, + "num_tokens": 26315963.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 203.28125, + "completions/mean_terminated_length": 203.28125, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.06290442346451681, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1318359375, + "kl": 0.0472224080003798, + "learning_rate": 7.7632e-06, + "loss": 0.0019, + "num_tokens": 26370724.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 172.59375, + "completions/mean_terminated_length": 172.59375, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.06301050175029171, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1005859375, + "kl": 0.03370702202664688, + "learning_rate": 7.7628e-06, + "loss": 0.0013, + "num_tokens": 26414359.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 268.21875, + "completions/mean_terminated_length": 268.21875, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.06311658003606661, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.03185254998970777, + "learning_rate": 7.7624e-06, + "loss": 0.0013, + "num_tokens": 26443774.0, + "reward": 3.669276714324951, + "reward_std": 0.4985271990299225, + "rewards/reward_fn/mean": 3.669276714324951, + "rewards/reward_fn/std": 0.4985271692276001, + "step": 595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.0, + "completions/max_terminated_length": 561.0, + "completions/mean_length": 326.03125, + "completions/mean_terminated_length": 326.03125, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.06322265832184151, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.031429233436938375, + "learning_rate": 7.762e-06, + "loss": 0.0013, + "num_tokens": 26473759.0, + "reward": 2.7664852142333984, + "reward_std": 0.09329904615879059, + "rewards/reward_fn/mean": 2.7664852142333984, + "rewards/reward_fn/std": 0.0932990238070488, + "step": 596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 267.28125, + "completions/mean_terminated_length": 267.28125, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.06332873660761643, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0751953125, + "kl": 0.03974258748348802, + "learning_rate": 7.761599999999999e-06, + "loss": 0.0016, + "num_tokens": 26517672.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 239.5625, + "completions/mean_terminated_length": 239.5625, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.06343481489339133, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05908203125, + "kl": 0.018108745745848864, + "learning_rate": 7.7612e-06, + "loss": 0.0007, + "num_tokens": 26570458.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 254.78125, + "completions/mean_terminated_length": 254.78125, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.06354089317916622, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.091796875, + "kl": 0.030075128830503672, + "learning_rate": 7.760799999999999e-06, + "loss": 0.0012, + "num_tokens": 26614131.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 948.0, + "completions/max_terminated_length": 948.0, + "completions/mean_length": 497.65625, + "completions/mean_terminated_length": 497.65625, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "epoch": 0.06364697146494112, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.875, + "kl": 0.041799441270995885, + "learning_rate": 7.7604e-06, + "loss": 0.0017, + "num_tokens": 26679752.0, + "reward": 2.796441078186035, + "reward_std": 0.0220673605799675, + "rewards/reward_fn/mean": 2.796441078186035, + "rewards/reward_fn/std": 0.022067388519644737, + "step": 600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 235.3125, + "completions/mean_terminated_length": 235.3125, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.06375304975071602, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.02781981969019398, + "learning_rate": 7.76e-06, + "loss": 0.0011, + "num_tokens": 26732018.0, + "reward": 2.8756937980651855, + "reward_std": 0.03420599177479744, + "rewards/reward_fn/mean": 2.8756937980651855, + "rewards/reward_fn/std": 0.03420599177479744, + "step": 601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 734.0, + "completions/max_terminated_length": 734.0, + "completions/mean_length": 292.4375, + "completions/mean_terminated_length": 292.4375, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.06385912803649094, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08837890625, + "kl": 0.04630755807738751, + "learning_rate": 7.7596e-06, + "loss": 0.0019, + "num_tokens": 26781184.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1258.0, + "completions/max_terminated_length": 1258.0, + "completions/mean_length": 514.8125, + "completions/mean_terminated_length": 514.8125, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.06396520632226584, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.0662745222216472, + "learning_rate": 7.7592e-06, + "loss": 0.0027, + "num_tokens": 26832442.0, + "reward": 3.023439884185791, + "reward_std": 0.4279276430606842, + "rewards/reward_fn/mean": 3.023439884185791, + "rewards/reward_fn/std": 0.427927702665329, + "step": 603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.0, + "completions/max_terminated_length": 568.0, + "completions/mean_length": 279.0625, + "completions/mean_terminated_length": 279.0625, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.06407128460804073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09033203125, + "kl": 0.03080717130796984, + "learning_rate": 7.7588e-06, + "loss": 0.0012, + "num_tokens": 26855164.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 161.875, + "completions/mean_terminated_length": 161.875, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.06417736289381563, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.138671875, + "kl": 0.060264868661761284, + "learning_rate": 7.7584e-06, + "loss": 0.0024, + "num_tokens": 26895832.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 173.625, + "completions/mean_terminated_length": 173.625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.06428344117959053, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.030933504458516836, + "learning_rate": 7.758e-06, + "loss": 0.0012, + "num_tokens": 26934828.0, + "reward": 2.9522957801818848, + "reward_std": 0.016364410519599915, + "rewards/reward_fn/mean": 2.9522957801818848, + "rewards/reward_fn/std": 0.016364362090826035, + "step": 606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 228.875, + "completions/mean_terminated_length": 228.875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.06438951946536543, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0869140625, + "kl": 0.024772782082436606, + "learning_rate": 7.7576e-06, + "loss": 0.001, + "num_tokens": 26981256.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1384.0, + "completions/max_terminated_length": 1384.0, + "completions/mean_length": 412.75, + "completions/mean_terminated_length": 412.75, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.06449559775114035, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.0625, + "kl": 0.08590105758048594, + "learning_rate": 7.7572e-06, + "loss": 0.0034, + "num_tokens": 27028832.0, + "reward": 3.3276102542877197, + "reward_std": 0.4957602918148041, + "rewards/reward_fn/mean": 3.3276102542877197, + "rewards/reward_fn/std": 0.49576032161712646, + "step": 608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 252.8125, + "completions/mean_terminated_length": 252.8125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.06460167603691525, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.03629728313535452, + "learning_rate": 7.7568e-06, + "loss": 0.0015, + "num_tokens": 27066938.0, + "reward": 3.7472081184387207, + "reward_std": 0.4863166809082031, + "rewards/reward_fn/mean": 3.7472081184387207, + "rewards/reward_fn/std": 0.48631665110588074, + "step": 609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 227.8125, + "completions/mean_terminated_length": 227.8125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.06470775432269014, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.024877535848645493, + "learning_rate": 7.756399999999999e-06, + "loss": 0.001, + "num_tokens": 27121588.0, + "reward": 1.737754464149475, + "reward_std": 0.02460699900984764, + "rewards/reward_fn/mean": 1.737754464149475, + "rewards/reward_fn/std": 0.02460700459778309, + "step": 610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 850.0, + "completions/max_terminated_length": 850.0, + "completions/mean_length": 243.65625, + "completions/mean_terminated_length": 243.65625, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.06481383260846504, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.115234375, + "kl": 0.0540953372255899, + "learning_rate": 7.756e-06, + "loss": 0.0022, + "num_tokens": 27163785.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1047.0, + "completions/max_terminated_length": 1047.0, + "completions/mean_length": 342.71875, + "completions/mean_terminated_length": 342.71875, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.06491991089423994, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.05290446960134432, + "learning_rate": 7.7556e-06, + "loss": 0.0021, + "num_tokens": 27207776.0, + "reward": 3.002070903778076, + "reward_std": 0.05182076618075371, + "rewards/reward_fn/mean": 3.002070903778076, + "rewards/reward_fn/std": 0.051820818334817886, + "step": 612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1387.0, + "completions/max_terminated_length": 1387.0, + "completions/mean_length": 407.4375, + "completions/mean_terminated_length": 407.4375, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.06502598918001486, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09130859375, + "kl": 0.07026238366961479, + "learning_rate": 7.7552e-06, + "loss": 0.0028, + "num_tokens": 27259982.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 578.0, + "completions/max_terminated_length": 578.0, + "completions/mean_length": 292.0625, + "completions/mean_terminated_length": 292.0625, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.06513206746578976, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09814453125, + "kl": 0.05310130078578368, + "learning_rate": 7.7548e-06, + "loss": 0.0021, + "num_tokens": 27315440.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 221.6875, + "completions/mean_terminated_length": 221.6875, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.06523814575156466, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10693359375, + "kl": 0.05479395019938238, + "learning_rate": 7.7544e-06, + "loss": 0.0022, + "num_tokens": 27351750.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.0, + "completions/max_terminated_length": 594.0, + "completions/mean_length": 244.5625, + "completions/mean_terminated_length": 244.5625, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.06534422403733955, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08056640625, + "kl": 0.0540774236433208, + "learning_rate": 7.753999999999999e-06, + "loss": 0.0022, + "num_tokens": 27398040.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 550.0, + "completions/max_terminated_length": 550.0, + "completions/mean_length": 231.6875, + "completions/mean_terminated_length": 231.6875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.06545030232311445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0869140625, + "kl": 0.041808648442383856, + "learning_rate": 7.7536e-06, + "loss": 0.0017, + "num_tokens": 27457646.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1973.0, + "completions/mean_length": 1549.90625, + "completions/mean_terminated_length": 1209.105224609375, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "epoch": 0.06555638060888937, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.04176906222710386, + "learning_rate": 7.753199999999999e-06, + "loss": 0.0017, + "num_tokens": 27542891.0, + "reward": 1.575758934020996, + "reward_std": 1.2176213264465332, + "rewards/reward_fn/mean": 1.575758934020996, + "rewards/reward_fn/std": 1.2176213264465332, + "step": 618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1917.0, + "completions/max_terminated_length": 1917.0, + "completions/mean_length": 389.1875, + "completions/mean_terminated_length": 389.1875, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.06566245889466427, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.059505374170839787, + "learning_rate": 7.7528e-06, + "loss": 0.0024, + "num_tokens": 27574737.0, + "reward": 3.0435049533843994, + "reward_std": 0.057892631739377975, + "rewards/reward_fn/mean": 3.0435049533843994, + "rewards/reward_fn/std": 0.05789259821176529, + "step": 619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1275.0, + "completions/max_terminated_length": 1275.0, + "completions/mean_length": 348.15625, + "completions/mean_terminated_length": 348.15625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.06576853718043917, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.07023045897949487, + "learning_rate": 7.752399999999999e-06, + "loss": 0.0028, + "num_tokens": 27615606.0, + "reward": 3.835360050201416, + "reward_std": 0.3887580633163452, + "rewards/reward_fn/mean": 3.835360050201416, + "rewards/reward_fn/std": 0.3887580633163452, + "step": 620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 997.0, + "completions/max_terminated_length": 997.0, + "completions/mean_length": 335.5, + "completions/mean_terminated_length": 335.5, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.06587461546621406, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.59375, + "kl": 0.05529448727611452, + "learning_rate": 7.752e-06, + "loss": 0.0022, + "num_tokens": 27672454.0, + "reward": 2.523176670074463, + "reward_std": 1.0861042737960815, + "rewards/reward_fn/mean": 2.523176670074463, + "rewards/reward_fn/std": 1.0861042737960815, + "step": 621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1912.0, + "completions/max_terminated_length": 1912.0, + "completions/mean_length": 435.96875, + "completions/mean_terminated_length": 435.96875, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.06598069375198896, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.05438588681863621, + "learning_rate": 7.751599999999999e-06, + "loss": 0.0022, + "num_tokens": 27726629.0, + "reward": 3.077277421951294, + "reward_std": 0.45385316014289856, + "rewards/reward_fn/mean": 3.077277421951294, + "rewards/reward_fn/std": 0.45385313034057617, + "step": 622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1613.0, + "completions/mean_length": 572.25, + "completions/mean_terminated_length": 524.6451416015625, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.06608677203776386, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9765625, + "kl": 0.06977469654520974, + "learning_rate": 7.7512e-06, + "loss": 0.0028, + "num_tokens": 27781901.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 651.0, + "completions/max_terminated_length": 651.0, + "completions/mean_length": 238.21875, + "completions/mean_terminated_length": 238.21875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.06619285032353878, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.03125, + "kl": 0.05847651744261384, + "learning_rate": 7.7508e-06, + "loss": 0.0023, + "num_tokens": 27835924.0, + "reward": 3.654426336288452, + "reward_std": 0.3738429844379425, + "rewards/reward_fn/mean": 3.654426336288452, + "rewards/reward_fn/std": 0.3738429844379425, + "step": 624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1649.0, + "completions/mean_length": 512.5625, + "completions/mean_terminated_length": 463.0322570800781, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.06629892860931368, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.06687481026165187, + "learning_rate": 7.7504e-06, + "loss": 0.0027, + "num_tokens": 27882566.0, + "reward": 2.687096357345581, + "reward_std": 0.513810932636261, + "rewards/reward_fn/mean": 2.687096357345581, + "rewards/reward_fn/std": 0.5138109922409058, + "step": 625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1346.0, + "completions/max_terminated_length": 1346.0, + "completions/mean_length": 325.40625, + "completions/mean_terminated_length": 325.40625, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.06640500689508858, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.04519660054938868, + "learning_rate": 7.75e-06, + "loss": 0.0018, + "num_tokens": 27915059.0, + "reward": 3.95910382270813, + "reward_std": 0.23134386539459229, + "rewards/reward_fn/mean": 3.95910382270813, + "rewards/reward_fn/std": 0.23134388029575348, + "step": 626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1281.0, + "completions/max_terminated_length": 1281.0, + "completions/mean_length": 377.1875, + "completions/mean_terminated_length": 377.1875, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.06651108518086347, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.051734329434111714, + "learning_rate": 7.7496e-06, + "loss": 0.0021, + "num_tokens": 27972953.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 644.0, + "completions/max_terminated_length": 644.0, + "completions/mean_length": 169.21875, + "completions/mean_terminated_length": 169.21875, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.06661716346663837, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.359375, + "kl": 0.046305317111546174, + "learning_rate": 7.7492e-06, + "loss": 0.0019, + "num_tokens": 28016192.0, + "reward": 3.564767837524414, + "reward_std": 0.5709716081619263, + "rewards/reward_fn/mean": 3.564767837524414, + "rewards/reward_fn/std": 0.5709716081619263, + "step": 628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 954.0, + "completions/max_terminated_length": 954.0, + "completions/mean_length": 299.875, + "completions/mean_terminated_length": 299.875, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.06672324175241329, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9296875, + "kl": 0.05322365625761449, + "learning_rate": 7.7488e-06, + "loss": 0.0021, + "num_tokens": 28064860.0, + "reward": 2.887842893600464, + "reward_std": 0.03686607629060745, + "rewards/reward_fn/mean": 2.887842893600464, + "rewards/reward_fn/std": 0.036866072565317154, + "step": 629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 111.1875, + "completions/mean_terminated_length": 111.1875, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.06682932003818819, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.25, + "kl": 0.040750893735093996, + "learning_rate": 7.7484e-06, + "loss": 0.0016, + "num_tokens": 28108034.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 157.78125, + "completions/mean_terminated_length": 157.78125, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.06693539832396309, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1083984375, + "kl": 0.029879302775952965, + "learning_rate": 7.748e-06, + "loss": 0.0012, + "num_tokens": 28145371.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.0, + "completions/max_terminated_length": 517.0, + "completions/mean_length": 264.9375, + "completions/mean_terminated_length": 264.9375, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.06704147660973798, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.03904081700602546, + "learning_rate": 7.7476e-06, + "loss": 0.0016, + "num_tokens": 28196857.0, + "reward": 2.8503220081329346, + "reward_std": 0.020154688507318497, + "rewards/reward_fn/mean": 2.8503220081329346, + "rewards/reward_fn/std": 0.02015470154583454, + "step": 632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 227.28125, + "completions/mean_terminated_length": 227.28125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.06714755489551288, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10693359375, + "kl": 0.05340335750952363, + "learning_rate": 7.7472e-06, + "loss": 0.0021, + "num_tokens": 28235810.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 151.9375, + "completions/mean_terminated_length": 151.9375, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.06725363318128778, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.40625, + "kl": 0.046266791701782495, + "learning_rate": 7.7468e-06, + "loss": 0.0019, + "num_tokens": 28288096.0, + "reward": 2.819547653198242, + "reward_std": 0.035231560468673706, + "rewards/reward_fn/mean": 2.819547653198242, + "rewards/reward_fn/std": 0.03523159399628639, + "step": 634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 176.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 149.90625, + "completions/mean_terminated_length": 149.90625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.0673597114670627, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14453125, + "kl": 0.02690366155002266, + "learning_rate": 7.7464e-06, + "loss": 0.0011, + "num_tokens": 28347933.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 226.59375, + "completions/mean_terminated_length": 226.59375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.0674657897528376, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1220703125, + "kl": 0.047625879873521626, + "learning_rate": 7.746e-06, + "loss": 0.0019, + "num_tokens": 28397328.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 232.625, + "completions/mean_terminated_length": 232.625, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.0675718680386125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11328125, + "kl": 0.054383948212489486, + "learning_rate": 7.7456e-06, + "loss": 0.0022, + "num_tokens": 28443652.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 160.125, + "completions/mean_terminated_length": 160.125, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.0676779463243874, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.140625, + "kl": 0.044864116644021124, + "learning_rate": 7.7452e-06, + "loss": 0.0018, + "num_tokens": 28477256.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 143.3125, + "completions/mean_terminated_length": 143.3125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.0677840246101623, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10791015625, + "kl": 0.03261691506486386, + "learning_rate": 7.744799999999999e-06, + "loss": 0.0013, + "num_tokens": 28518738.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 139.09375, + "completions/mean_terminated_length": 139.09375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.06789010289593721, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0888671875, + "kl": 0.04600106261204928, + "learning_rate": 7.7444e-06, + "loss": 0.0018, + "num_tokens": 28566805.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 153.59375, + "completions/mean_terminated_length": 153.59375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.0679961811817121, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1337890625, + "kl": 0.05108794302213937, + "learning_rate": 7.743999999999999e-06, + "loss": 0.002, + "num_tokens": 28627016.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 365.1875, + "completions/mean_terminated_length": 365.1875, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.068102259467487, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.02875410852720961, + "learning_rate": 7.7436e-06, + "loss": 0.0011, + "num_tokens": 28676878.0, + "reward": 2.87377667427063, + "reward_std": 0.03773174062371254, + "rewards/reward_fn/mean": 2.87377667427063, + "rewards/reward_fn/std": 0.03773171827197075, + "step": 642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1325.0, + "completions/max_terminated_length": 1325.0, + "completions/mean_length": 715.3125, + "completions/mean_terminated_length": 715.3125, + "completions/min_length": 380.0, + "completions/min_terminated_length": 380.0, + "epoch": 0.0682083377532619, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.06581689266022295, + "learning_rate": 7.743199999999999e-06, + "loss": 0.0026, + "num_tokens": 28736984.0, + "reward": 2.59894061088562, + "reward_std": 0.3901492953300476, + "rewards/reward_fn/mean": 2.59894061088562, + "rewards/reward_fn/std": 0.3901492655277252, + "step": 643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 132.0, + "completions/max_terminated_length": 132.0, + "completions/mean_length": 105.78125, + "completions/mean_terminated_length": 105.78125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.0683144160390368, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.140625, + "kl": 0.03504444262944162, + "learning_rate": 7.7428e-06, + "loss": 0.0014, + "num_tokens": 28773137.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 179.84375, + "completions/mean_terminated_length": 179.84375, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.06842049432481172, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.04245975334197283, + "learning_rate": 7.742399999999999e-06, + "loss": 0.0017, + "num_tokens": 28818060.0, + "reward": 3.962952136993408, + "reward_std": 0.20957522094249725, + "rewards/reward_fn/mean": 3.962952136993408, + "rewards/reward_fn/std": 0.20957525074481964, + "step": 645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 624.0, + "completions/max_terminated_length": 624.0, + "completions/mean_length": 336.9375, + "completions/mean_terminated_length": 336.9375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.06852657261058662, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.0532979853451252, + "learning_rate": 7.742e-06, + "loss": 0.0021, + "num_tokens": 28857322.0, + "reward": 3.685168743133545, + "reward_std": 0.5120775699615479, + "rewards/reward_fn/mean": 3.685168743133545, + "rewards/reward_fn/std": 0.5120775699615479, + "step": 646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 142.03125, + "completions/mean_terminated_length": 142.03125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.06863265089636152, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0869140625, + "kl": 0.023056287580402568, + "learning_rate": 7.741599999999999e-06, + "loss": 0.0009, + "num_tokens": 28894699.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 226.34375, + "completions/mean_terminated_length": 226.34375, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.06873872918213642, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.04996389290317893, + "learning_rate": 7.7412e-06, + "loss": 0.002, + "num_tokens": 28938166.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 266.90625, + "completions/mean_terminated_length": 266.90625, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.06884480746791131, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07421875, + "kl": 0.03471590060507879, + "learning_rate": 7.7408e-06, + "loss": 0.0014, + "num_tokens": 28994931.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 155.0, + "completions/max_terminated_length": 155.0, + "completions/mean_length": 131.625, + "completions/mean_terminated_length": 131.625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.06895088575368621, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.109375, + "kl": 0.031769126711878926, + "learning_rate": 7.7404e-06, + "loss": 0.0013, + "num_tokens": 29029703.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 108.0, + "completions/max_terminated_length": 108.0, + "completions/mean_length": 86.0, + "completions/mean_terminated_length": 86.0, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.06905696403946113, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.96875, + "kl": 0.041167730116285384, + "learning_rate": 7.74e-06, + "loss": 0.0017, + "num_tokens": 29050727.0, + "reward": 3.076449155807495, + "reward_std": 0.009099267423152924, + "rewards/reward_fn/mean": 3.076449155807495, + "rewards/reward_fn/std": 0.009099281392991543, + "step": 651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 264.125, + "completions/mean_terminated_length": 264.125, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.06916304232523603, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.045598643948324025, + "learning_rate": 7.7396e-06, + "loss": 0.0018, + "num_tokens": 29114539.0, + "reward": 2.8244357109069824, + "reward_std": 0.03792543336749077, + "rewards/reward_fn/mean": 2.8244357109069824, + "rewards/reward_fn/std": 0.03792539983987808, + "step": 652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 229.1875, + "completions/mean_terminated_length": 229.1875, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.06926912061101093, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.043347687751520425, + "learning_rate": 7.7392e-06, + "loss": 0.0017, + "num_tokens": 29152081.0, + "reward": 3.718015670776367, + "reward_std": 0.42512795329093933, + "rewards/reward_fn/mean": 3.718015670776367, + "rewards/reward_fn/std": 0.4251279830932617, + "step": 653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 240.625, + "completions/mean_terminated_length": 240.625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.06937519889678583, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.0510714779375121, + "learning_rate": 7.7388e-06, + "loss": 0.002, + "num_tokens": 29194725.0, + "reward": 3.521083354949951, + "reward_std": 0.7184927463531494, + "rewards/reward_fn/mean": 3.521083354949951, + "rewards/reward_fn/std": 0.7184926867485046, + "step": 654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 318.0, + "completions/mean_terminated_length": 318.0, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.06948127718256072, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.05452185485046357, + "learning_rate": 7.7384e-06, + "loss": 0.0022, + "num_tokens": 29241221.0, + "reward": 3.9632177352905273, + "reward_std": 0.2080727219581604, + "rewards/reward_fn/mean": 3.9632177352905273, + "rewards/reward_fn/std": 0.2080727070569992, + "step": 655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 171.0, + "completions/max_terminated_length": 171.0, + "completions/mean_length": 131.0625, + "completions/mean_terminated_length": 131.0625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.06958735546833564, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.265625, + "kl": 0.0322262124682311, + "learning_rate": 7.738e-06, + "loss": 0.0013, + "num_tokens": 29278951.0, + "reward": 2.871333599090576, + "reward_std": 0.014810092747211456, + "rewards/reward_fn/mean": 2.871333599090576, + "rewards/reward_fn/std": 0.01481009740382433, + "step": 656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 212.3125, + "completions/mean_terminated_length": 212.3125, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.06969343375411054, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.04696744668763131, + "learning_rate": 7.737599999999999e-06, + "loss": 0.0019, + "num_tokens": 29318705.0, + "reward": 2.895874500274658, + "reward_std": 0.022386854514479637, + "rewards/reward_fn/mean": 2.895874500274658, + "rewards/reward_fn/std": 0.022386867552995682, + "step": 657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 265.46875, + "completions/mean_terminated_length": 265.46875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.06979951203988544, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.02159346017288044, + "learning_rate": 7.7372e-06, + "loss": 0.0009, + "num_tokens": 29347392.0, + "reward": 2.750436305999756, + "reward_std": 0.03207426145672798, + "rewards/reward_fn/mean": 2.750436305999756, + "rewards/reward_fn/std": 0.032074298709630966, + "step": 658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 184.375, + "completions/mean_terminated_length": 184.375, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.06990559032566034, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1552734375, + "kl": 0.04227348789572716, + "learning_rate": 7.736799999999998e-06, + "loss": 0.0017, + "num_tokens": 29370412.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 291.28125, + "completions/mean_terminated_length": 291.28125, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.07001166861143523, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.032787035626824945, + "learning_rate": 7.7364e-06, + "loss": 0.0013, + "num_tokens": 29416629.0, + "reward": 3.9628915786743164, + "reward_std": 0.20991654694080353, + "rewards/reward_fn/mean": 3.9628915786743164, + "rewards/reward_fn/std": 0.20991650223731995, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 179.0, + "completions/max_terminated_length": 179.0, + "completions/mean_length": 150.71875, + "completions/mean_terminated_length": 150.71875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.07011774689721013, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.05282619083300233, + "learning_rate": 7.736e-06, + "loss": 0.0021, + "num_tokens": 29451724.0, + "reward": 2.8188912868499756, + "reward_std": 0.01984592340886593, + "rewards/reward_fn/mean": 2.8188912868499756, + "rewards/reward_fn/std": 0.01984594017267227, + "step": 661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 108.0, + "completions/max_terminated_length": 108.0, + "completions/mean_length": 85.78125, + "completions/mean_terminated_length": 85.78125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.07022382518298505, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19140625, + "kl": 0.023483653378207237, + "learning_rate": 7.7356e-06, + "loss": 0.0009, + "num_tokens": 29482821.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 142.0, + "completions/max_terminated_length": 142.0, + "completions/mean_length": 96.3125, + "completions/mean_terminated_length": 96.3125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.07032990346875995, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.71875, + "kl": 0.04963029740611091, + "learning_rate": 7.7352e-06, + "loss": 0.002, + "num_tokens": 29530031.0, + "reward": 3.210749626159668, + "reward_std": 0.009940498508512974, + "rewards/reward_fn/mean": 3.210749626159668, + "rewards/reward_fn/std": 0.009940499439835548, + "step": 663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 191.4375, + "completions/mean_terminated_length": 191.4375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.07043598175453485, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.05990648630540818, + "learning_rate": 7.7348e-06, + "loss": 0.0024, + "num_tokens": 29554973.0, + "reward": 3.537442207336426, + "reward_std": 0.6495077013969421, + "rewards/reward_fn/mean": 3.537442207336426, + "rewards/reward_fn/std": 0.6495076417922974, + "step": 664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 154.0, + "completions/max_terminated_length": 154.0, + "completions/mean_length": 142.03125, + "completions/mean_terminated_length": 142.03125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.07054206004030975, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.150390625, + "kl": 0.0415572501369752, + "learning_rate": 7.7344e-06, + "loss": 0.0017, + "num_tokens": 29608254.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 207.09375, + "completions/mean_terminated_length": 207.09375, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.07064813832608464, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1796875, + "kl": 0.03682469832710922, + "learning_rate": 7.733999999999999e-06, + "loss": 0.0015, + "num_tokens": 29655361.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 211.0, + "completions/mean_terminated_length": 211.0, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.07075421661185956, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.0421582740964368, + "learning_rate": 7.7336e-06, + "loss": 0.0017, + "num_tokens": 29678721.0, + "reward": 3.49739933013916, + "reward_std": 0.45667457580566406, + "rewards/reward_fn/mean": 3.49739933013916, + "rewards/reward_fn/std": 0.45667460560798645, + "step": 667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 133.1875, + "completions/mean_terminated_length": 133.1875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.07086029489763446, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.0935792492236942, + "learning_rate": 7.733199999999999e-06, + "loss": 0.0037, + "num_tokens": 29700839.0, + "reward": 2.8479533195495605, + "reward_std": 0.30300331115722656, + "rewards/reward_fn/mean": 2.8479533195495605, + "rewards/reward_fn/std": 0.3030032813549042, + "step": 668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 351.03125, + "completions/mean_terminated_length": 351.03125, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.07096637318340936, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.06572336354292929, + "learning_rate": 7.7328e-06, + "loss": 0.0026, + "num_tokens": 29755400.0, + "reward": 3.5044994354248047, + "reward_std": 0.5040919184684753, + "rewards/reward_fn/mean": 3.5044994354248047, + "rewards/reward_fn/std": 0.5040919184684753, + "step": 669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 167.46875, + "completions/mean_terminated_length": 167.46875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.07107245146918426, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.05414850829401985, + "learning_rate": 7.732399999999999e-06, + "loss": 0.0022, + "num_tokens": 29807191.0, + "reward": 3.538360118865967, + "reward_std": 0.5391759276390076, + "rewards/reward_fn/mean": 3.538360118865967, + "rewards/reward_fn/std": 0.5391759276390076, + "step": 670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 133.21875, + "completions/mean_terminated_length": 133.21875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.07117852975495916, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9375, + "kl": 0.1002915867138654, + "learning_rate": 7.732e-06, + "loss": 0.004, + "num_tokens": 29841758.0, + "reward": 2.857252359390259, + "reward_std": 0.05256405472755432, + "rewards/reward_fn/mean": 2.857252359390259, + "rewards/reward_fn/std": 0.05256406217813492, + "step": 671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1114.0, + "completions/max_terminated_length": 1114.0, + "completions/mean_length": 627.9375, + "completions/mean_terminated_length": 627.9375, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.07128460804073407, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0703125, + "kl": 0.0414414910483174, + "learning_rate": 7.7316e-06, + "loss": 0.0017, + "num_tokens": 29897244.0, + "reward": 2.651923656463623, + "reward_std": 0.30288705229759216, + "rewards/reward_fn/mean": 2.651923656463623, + "rewards/reward_fn/std": 0.3028870224952698, + "step": 672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 243.25, + "completions/mean_terminated_length": 243.25, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.07139068632650897, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11376953125, + "kl": 0.06160746526438743, + "learning_rate": 7.7312e-06, + "loss": 0.0025, + "num_tokens": 29937188.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 190.0625, + "completions/mean_terminated_length": 190.0625, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.07149676461228387, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.130859375, + "kl": 0.043393855332396924, + "learning_rate": 7.7308e-06, + "loss": 0.0017, + "num_tokens": 29983494.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 206.09375, + "completions/mean_terminated_length": 206.09375, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.07160284289805877, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9921875, + "kl": 0.05433402652852237, + "learning_rate": 7.7304e-06, + "loss": 0.0022, + "num_tokens": 30029865.0, + "reward": 3.931962013244629, + "reward_std": 0.3848804235458374, + "rewards/reward_fn/mean": 3.931962013244629, + "rewards/reward_fn/std": 0.3848804533481598, + "step": 675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 220.5625, + "completions/mean_terminated_length": 220.5625, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.07170892118383367, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.028526412788778543, + "learning_rate": 7.73e-06, + "loss": 0.0011, + "num_tokens": 30070107.0, + "reward": 2.9268527030944824, + "reward_std": 0.03191540390253067, + "rewards/reward_fn/mean": 2.9268527030944824, + "rewards/reward_fn/std": 0.03191535919904709, + "step": 676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 649.0, + "completions/max_terminated_length": 649.0, + "completions/mean_length": 342.78125, + "completions/mean_terminated_length": 342.78125, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.07181499946960856, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.05647675262298435, + "learning_rate": 7.7296e-06, + "loss": 0.0023, + "num_tokens": 30117972.0, + "reward": 2.998821258544922, + "reward_std": 0.23165473341941833, + "rewards/reward_fn/mean": 2.998821258544922, + "rewards/reward_fn/std": 0.23165474832057953, + "step": 677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 211.78125, + "completions/mean_terminated_length": 211.78125, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.07192107775538348, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.125, + "kl": 0.04946940788067877, + "learning_rate": 7.729199999999999e-06, + "loss": 0.002, + "num_tokens": 30154189.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 175.0, + "completions/max_terminated_length": 175.0, + "completions/mean_length": 167.625, + "completions/mean_terminated_length": 167.625, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.07202715604115838, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1083984375, + "kl": 0.03135518968338147, + "learning_rate": 7.7288e-06, + "loss": 0.0013, + "num_tokens": 30210497.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 220.0625, + "completions/mean_terminated_length": 220.0625, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.07213323432693328, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09765625, + "kl": 0.027862557559274137, + "learning_rate": 7.728399999999999e-06, + "loss": 0.0011, + "num_tokens": 30259875.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 206.0, + "completions/mean_terminated_length": 206.0, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.07223931261270818, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.051036402059253305, + "learning_rate": 7.728e-06, + "loss": 0.002, + "num_tokens": 30320515.0, + "reward": 2.9141030311584473, + "reward_std": 0.038106195628643036, + "rewards/reward_fn/mean": 2.9141030311584473, + "rewards/reward_fn/std": 0.03810620680451393, + "step": 681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 180.0, + "completions/max_terminated_length": 180.0, + "completions/mean_length": 159.53125, + "completions/mean_terminated_length": 159.53125, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.07234539089848308, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09375, + "kl": 0.032492958940565586, + "learning_rate": 7.727599999999999e-06, + "loss": 0.0013, + "num_tokens": 30374452.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 113.15625, + "completions/mean_terminated_length": 113.15625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.07245146918425799, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.048680256586521864, + "learning_rate": 7.7272e-06, + "loss": 0.0019, + "num_tokens": 30412665.0, + "reward": 3.208629608154297, + "reward_std": 0.022559581324458122, + "rewards/reward_fn/mean": 3.208629608154297, + "rewards/reward_fn/std": 0.022559557110071182, + "step": 683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 180.65625, + "completions/mean_terminated_length": 180.65625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.07255754747003289, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.06957442464772612, + "learning_rate": 7.7268e-06, + "loss": 0.0028, + "num_tokens": 30458638.0, + "reward": 2.8663017749786377, + "reward_std": 0.034207750111818314, + "rewards/reward_fn/mean": 2.8663017749786377, + "rewards/reward_fn/std": 0.034207772463560104, + "step": 684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 336.78125, + "completions/mean_terminated_length": 336.78125, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.07266362575580779, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.057023753644898534, + "learning_rate": 7.7264e-06, + "loss": 0.0023, + "num_tokens": 30518407.0, + "reward": 3.330765962600708, + "reward_std": 0.5284169316291809, + "rewards/reward_fn/mean": 3.330765962600708, + "rewards/reward_fn/std": 0.5284168720245361, + "step": 685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 904.0, + "completions/max_terminated_length": 904.0, + "completions/mean_length": 537.96875, + "completions/mean_terminated_length": 537.96875, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.07276970404158269, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.04859426280017942, + "learning_rate": 7.726e-06, + "loss": 0.0019, + "num_tokens": 30574982.0, + "reward": 3.4425101280212402, + "reward_std": 0.5336366891860962, + "rewards/reward_fn/mean": 3.4425101280212402, + "rewards/reward_fn/std": 0.5336366295814514, + "step": 686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 158.15625, + "completions/mean_terminated_length": 158.15625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.07287578232735759, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13671875, + "kl": 0.03376417566323653, + "learning_rate": 7.7256e-06, + "loss": 0.0014, + "num_tokens": 30608203.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 183.59375, + "completions/mean_terminated_length": 183.59375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.07298186061313248, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.06158266309648752, + "learning_rate": 7.7252e-06, + "loss": 0.0025, + "num_tokens": 30652990.0, + "reward": 3.9356517791748047, + "reward_std": 0.2532157003879547, + "rewards/reward_fn/mean": 3.9356517791748047, + "rewards/reward_fn/std": 0.2532157301902771, + "step": 688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 187.125, + "completions/mean_terminated_length": 187.125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.0730879388989074, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.07138905744068325, + "learning_rate": 7.7248e-06, + "loss": 0.0029, + "num_tokens": 30705058.0, + "reward": 3.8710503578186035, + "reward_std": 0.3467211425304413, + "rewards/reward_fn/mean": 3.8710503578186035, + "rewards/reward_fn/std": 0.3467211425304413, + "step": 689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 208.1875, + "completions/mean_terminated_length": 208.1875, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.0731940171846823, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.109375, + "kl": 0.03632815106539056, + "learning_rate": 7.7244e-06, + "loss": 0.0015, + "num_tokens": 30759784.0, + "reward": 3.0944759845733643, + "reward_std": 0.016841473057866096, + "rewards/reward_fn/mean": 3.0944759845733643, + "rewards/reward_fn/std": 0.016841456294059753, + "step": 690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 335.9375, + "completions/mean_terminated_length": 335.9375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.0733000954704572, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.06270804395899177, + "learning_rate": 7.724e-06, + "loss": 0.0025, + "num_tokens": 30803366.0, + "reward": 3.3440566062927246, + "reward_std": 0.5181226134300232, + "rewards/reward_fn/mean": 3.3440566062927246, + "rewards/reward_fn/std": 0.5181225538253784, + "step": 691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 648.0, + "completions/max_terminated_length": 648.0, + "completions/mean_length": 251.875, + "completions/mean_terminated_length": 251.875, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.0734061737562321, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.056056828470900655, + "learning_rate": 7.7236e-06, + "loss": 0.0022, + "num_tokens": 30832258.0, + "reward": 3.3849637508392334, + "reward_std": 0.8223517537117004, + "rewards/reward_fn/mean": 3.3849637508392334, + "rewards/reward_fn/std": 0.8223517537117004, + "step": 692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.0, + "completions/max_terminated_length": 517.0, + "completions/mean_length": 340.09375, + "completions/mean_terminated_length": 340.09375, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.073512252042007, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.05376436864025891, + "learning_rate": 7.7232e-06, + "loss": 0.0022, + "num_tokens": 30864485.0, + "reward": 3.321343421936035, + "reward_std": 0.608305811882019, + "rewards/reward_fn/mean": 3.321343421936035, + "rewards/reward_fn/std": 0.6083057522773743, + "step": 693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 309.40625, + "completions/mean_terminated_length": 309.40625, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.07361833032778191, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.15625, + "kl": 0.041985310381278396, + "learning_rate": 7.7228e-06, + "loss": 0.0017, + "num_tokens": 30887570.0, + "reward": 2.8665213584899902, + "reward_std": 0.019187498837709427, + "rewards/reward_fn/mean": 2.8665213584899902, + "rewards/reward_fn/std": 0.019187474623322487, + "step": 694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 185.34375, + "completions/mean_terminated_length": 185.34375, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.07372440861355681, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.546875, + "kl": 0.05925154301803559, + "learning_rate": 7.7224e-06, + "loss": 0.0024, + "num_tokens": 30929981.0, + "reward": 3.384206771850586, + "reward_std": 0.5540663599967957, + "rewards/reward_fn/mean": 3.384206771850586, + "rewards/reward_fn/std": 0.5540663003921509, + "step": 695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 172.0, + "completions/max_terminated_length": 172.0, + "completions/mean_length": 149.5, + "completions/mean_terminated_length": 149.5, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.07383048689933171, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07470703125, + "kl": 0.0262971400807146, + "learning_rate": 7.722e-06, + "loss": 0.0011, + "num_tokens": 30984845.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 195.75, + "completions/mean_terminated_length": 195.75, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.0739365651851066, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.890625, + "kl": 0.06998066313099116, + "learning_rate": 7.721599999999999e-06, + "loss": 0.0028, + "num_tokens": 31011685.0, + "reward": 2.9877424240112305, + "reward_std": 0.3321399986743927, + "rewards/reward_fn/mean": 2.9877424240112305, + "rewards/reward_fn/std": 0.3321399986743927, + "step": 697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 173.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 150.65625, + "completions/mean_terminated_length": 150.65625, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.0740426434708815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0810546875, + "kl": 0.04288387484848499, + "learning_rate": 7.7212e-06, + "loss": 0.0017, + "num_tokens": 31061082.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 171.75, + "completions/mean_terminated_length": 171.75, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.07414872175665642, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10791015625, + "kl": 0.047392503125593066, + "learning_rate": 7.720799999999999e-06, + "loss": 0.0019, + "num_tokens": 31085266.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 217.84375, + "completions/mean_terminated_length": 217.84375, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.07425480004243132, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.04449020640458912, + "learning_rate": 7.7204e-06, + "loss": 0.0018, + "num_tokens": 31135949.0, + "reward": 2.708951473236084, + "reward_std": 0.029957668855786324, + "rewards/reward_fn/mean": 2.708951473236084, + "rewards/reward_fn/std": 0.029957666993141174, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 110.0, + "completions/max_terminated_length": 110.0, + "completions/mean_length": 94.34375, + "completions/mean_terminated_length": 94.34375, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.07436087832820622, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09716796875, + "kl": 0.05266049993224442, + "learning_rate": 7.719999999999999e-06, + "loss": 0.0021, + "num_tokens": 31176536.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 175.0, + "completions/max_terminated_length": 175.0, + "completions/mean_length": 132.46875, + "completions/mean_terminated_length": 132.46875, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.07446695661398112, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.099609375, + "kl": 0.04129998397547752, + "learning_rate": 7.7196e-06, + "loss": 0.0017, + "num_tokens": 31212647.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 120.78125, + "completions/mean_terminated_length": 120.78125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.07457303489975602, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.04240784316789359, + "learning_rate": 7.719199999999999e-06, + "loss": 0.0017, + "num_tokens": 31252096.0, + "reward": 2.8910398483276367, + "reward_std": 0.03602422773838043, + "rewards/reward_fn/mean": 2.8910398483276367, + "rewards/reward_fn/std": 0.036024242639541626, + "step": 703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 834.0, + "completions/max_terminated_length": 834.0, + "completions/mean_length": 436.875, + "completions/mean_terminated_length": 436.875, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.07467911318553092, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.059532526414841413, + "learning_rate": 7.7188e-06, + "loss": 0.0024, + "num_tokens": 31302492.0, + "reward": 3.1270198822021484, + "reward_std": 0.8769363164901733, + "rewards/reward_fn/mean": 3.1270198822021484, + "rewards/reward_fn/std": 0.8769363164901733, + "step": 704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 262.625, + "completions/mean_terminated_length": 262.625, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.07478519147130583, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.091796875, + "kl": 0.04183370858663693, + "learning_rate": 7.718399999999999e-06, + "loss": 0.0017, + "num_tokens": 31344848.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 183.875, + "completions/mean_terminated_length": 183.875, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.07489126975708073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.126953125, + "kl": 0.044983384606894106, + "learning_rate": 7.718e-06, + "loss": 0.0018, + "num_tokens": 31384556.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 171.9375, + "completions/mean_terminated_length": 171.9375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.07499734804285563, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0947265625, + "kl": 0.05026729096425697, + "learning_rate": 7.7176e-06, + "loss": 0.002, + "num_tokens": 31441322.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 169.9375, + "completions/mean_terminated_length": 169.9375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.07510342632863053, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09375, + "kl": 0.036427939659915864, + "learning_rate": 7.7172e-06, + "loss": 0.0015, + "num_tokens": 31478984.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 136.125, + "completions/mean_terminated_length": 136.125, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.07520950461440543, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.142578125, + "kl": 0.05036911822389811, + "learning_rate": 7.7168e-06, + "loss": 0.002, + "num_tokens": 31513708.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 177.96875, + "completions/mean_terminated_length": 177.96875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.07531558290018034, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.06007158639840782, + "learning_rate": 7.7164e-06, + "loss": 0.0024, + "num_tokens": 31555595.0, + "reward": 3.960479736328125, + "reward_std": 0.15552453696727753, + "rewards/reward_fn/mean": 3.960479736328125, + "rewards/reward_fn/std": 0.15552456676959991, + "step": 710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 273.90625, + "completions/mean_terminated_length": 273.90625, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.07542166118595524, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8828125, + "kl": 0.07482221210375428, + "learning_rate": 7.716e-06, + "loss": 0.003, + "num_tokens": 31601640.0, + "reward": 3.256192445755005, + "reward_std": 0.5825570225715637, + "rewards/reward_fn/mean": 3.256192445755005, + "rewards/reward_fn/std": 0.5825570225715637, + "step": 711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 182.46875, + "completions/mean_terminated_length": 182.46875, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.07552773947173014, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.038130709261167794, + "learning_rate": 7.7156e-06, + "loss": 0.0015, + "num_tokens": 31651319.0, + "reward": 3.974000930786133, + "reward_std": 0.14707225561141968, + "rewards/reward_fn/mean": 3.974000930786133, + "rewards/reward_fn/std": 0.1470722258090973, + "step": 712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 234.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 180.625, + "completions/mean_terminated_length": 180.625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.07563381775750504, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.057246467215009034, + "learning_rate": 7.7152e-06, + "loss": 0.0023, + "num_tokens": 31680747.0, + "reward": 3.156899929046631, + "reward_std": 0.5360119342803955, + "rewards/reward_fn/mean": 3.156899929046631, + "rewards/reward_fn/std": 0.5360119342803955, + "step": 713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 176.75, + "completions/mean_terminated_length": 176.75, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.07573989604327994, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.087890625, + "kl": 0.032892783579882234, + "learning_rate": 7.7148e-06, + "loss": 0.0013, + "num_tokens": 31719331.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 154.0, + "completions/max_terminated_length": 154.0, + "completions/mean_length": 139.5, + "completions/mean_terminated_length": 139.5, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.07584597432905485, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08056640625, + "kl": 0.02656198089243844, + "learning_rate": 7.7144e-06, + "loss": 0.0011, + "num_tokens": 31745331.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 137.25, + "completions/mean_terminated_length": 137.25, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.07595205261482975, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10546875, + "kl": 0.036677059542853385, + "learning_rate": 7.714e-06, + "loss": 0.0015, + "num_tokens": 31782683.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 198.65625, + "completions/mean_terminated_length": 198.65625, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.07605813090060465, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1240234375, + "kl": 0.046608570439275354, + "learning_rate": 7.713599999999998e-06, + "loss": 0.0019, + "num_tokens": 31828976.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 238.84375, + "completions/mean_terminated_length": 238.84375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.07616420918637955, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3515625, + "kl": 0.06297203176654875, + "learning_rate": 7.7132e-06, + "loss": 0.0025, + "num_tokens": 31879499.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 283.40625, + "completions/mean_terminated_length": 283.40625, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.07627028747215445, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.0723530335817486, + "learning_rate": 7.7128e-06, + "loss": 0.0029, + "num_tokens": 31919480.0, + "reward": 2.887519359588623, + "reward_std": 0.4284627139568329, + "rewards/reward_fn/mean": 2.887519359588623, + "rewards/reward_fn/std": 0.4284627139568329, + "step": 719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 93.0, + "completions/max_terminated_length": 93.0, + "completions/mean_length": 70.34375, + "completions/mean_terminated_length": 70.34375, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.07637636575792935, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17578125, + "kl": 0.03475224046269432, + "learning_rate": 7.7124e-06, + "loss": 0.0014, + "num_tokens": 31959587.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 312.59375, + "completions/mean_terminated_length": 312.59375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.07648244404370426, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.078125, + "kl": 0.0463191430317238, + "learning_rate": 7.712e-06, + "loss": 0.0019, + "num_tokens": 32017942.0, + "reward": 3.9643290042877197, + "reward_std": 0.20178602635860443, + "rewards/reward_fn/mean": 3.9643290042877197, + "rewards/reward_fn/std": 0.20178604125976562, + "step": 721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 295.0625, + "completions/mean_terminated_length": 295.0625, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.07658852232947916, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.04168545902939513, + "learning_rate": 7.711599999999999e-06, + "loss": 0.0017, + "num_tokens": 32067992.0, + "reward": 2.7265396118164062, + "reward_std": 0.033310580998659134, + "rewards/reward_fn/mean": 2.7265396118164062, + "rewards/reward_fn/std": 0.03331058472394943, + "step": 722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 258.5, + "completions/mean_terminated_length": 258.5, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.07669460061525406, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.029477198084350675, + "learning_rate": 7.7112e-06, + "loss": 0.0012, + "num_tokens": 32112904.0, + "reward": 1.7350808382034302, + "reward_std": 0.022046852856874466, + "rewards/reward_fn/mean": 1.7350808382034302, + "rewards/reward_fn/std": 0.022046852856874466, + "step": 723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 176.0, + "completions/mean_terminated_length": 176.0, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.07680067890102896, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.04448731255251914, + "learning_rate": 7.710799999999999e-06, + "loss": 0.0018, + "num_tokens": 32149416.0, + "reward": 2.949578285217285, + "reward_std": 0.004209110513329506, + "rewards/reward_fn/mean": 2.949578285217285, + "rewards/reward_fn/std": 0.00420913752168417, + "step": 724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 723.0, + "completions/max_terminated_length": 723.0, + "completions/mean_length": 343.71875, + "completions/mean_terminated_length": 343.71875, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.07690675718680386, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3125, + "kl": 0.07948042592033744, + "learning_rate": 7.7104e-06, + "loss": 0.0032, + "num_tokens": 32209151.0, + "reward": 2.8511264324188232, + "reward_std": 0.054016876965761185, + "rewards/reward_fn/mean": 2.8511264324188232, + "rewards/reward_fn/std": 0.054016903042793274, + "step": 725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 248.09375, + "completions/mean_terminated_length": 248.09375, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.07701283547257877, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.095703125, + "kl": 0.0459744130494073, + "learning_rate": 7.709999999999999e-06, + "loss": 0.0018, + "num_tokens": 32258082.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.0, + "completions/max_terminated_length": 517.0, + "completions/mean_length": 289.59375, + "completions/mean_terminated_length": 289.59375, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.07711891375835367, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.05580032308353111, + "learning_rate": 7.7096e-06, + "loss": 0.0022, + "num_tokens": 32311349.0, + "reward": 2.961806297302246, + "reward_std": 0.3445666432380676, + "rewards/reward_fn/mean": 2.961806297302246, + "rewards/reward_fn/std": 0.34456658363342285, + "step": 727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 220.6875, + "completions/mean_terminated_length": 220.6875, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.07722499204412857, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.037108064629137516, + "learning_rate": 7.709199999999999e-06, + "loss": 0.0015, + "num_tokens": 32359083.0, + "reward": 3.6539883613586426, + "reward_std": 0.5620724558830261, + "rewards/reward_fn/mean": 3.6539883613586426, + "rewards/reward_fn/std": 0.5620723962783813, + "step": 728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 676.0, + "completions/max_terminated_length": 676.0, + "completions/mean_length": 319.0625, + "completions/mean_terminated_length": 319.0625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.07733107032990347, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.06664447928778827, + "learning_rate": 7.7088e-06, + "loss": 0.0027, + "num_tokens": 32406445.0, + "reward": 2.7638401985168457, + "reward_std": 0.045684244483709335, + "rewards/reward_fn/mean": 2.7638401985168457, + "rewards/reward_fn/std": 0.045684244483709335, + "step": 729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 186.78125, + "completions/mean_terminated_length": 186.78125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.07743714861567837, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.546875, + "kl": 0.07360926462570205, + "learning_rate": 7.7084e-06, + "loss": 0.0029, + "num_tokens": 32430950.0, + "reward": 3.3820438385009766, + "reward_std": 0.6281734704971313, + "rewards/reward_fn/mean": 3.3820438385009766, + "rewards/reward_fn/std": 0.6281735301017761, + "step": 730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 174.9375, + "completions/mean_terminated_length": 174.9375, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.07754322690145327, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1171875, + "kl": 0.07646584114991128, + "learning_rate": 7.708e-06, + "loss": 0.0031, + "num_tokens": 32479588.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 82.0, + "completions/max_terminated_length": 82.0, + "completions/mean_length": 81.25, + "completions/mean_terminated_length": 81.25, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.07764930518722818, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0693359375, + "kl": 0.022484338143840432, + "learning_rate": 7.7076e-06, + "loss": 0.0009, + "num_tokens": 32502924.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 231.125, + "completions/mean_terminated_length": 231.125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.07775538347300308, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.07375269406475127, + "learning_rate": 7.7072e-06, + "loss": 0.003, + "num_tokens": 32550960.0, + "reward": 3.9716763496398926, + "reward_std": 0.16022291779518127, + "rewards/reward_fn/mean": 3.9716763496398926, + "rewards/reward_fn/std": 0.16022291779518127, + "step": 733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 194.03125, + "completions/mean_terminated_length": 194.03125, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.07786146175877798, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11328125, + "kl": 0.052297455724328756, + "learning_rate": 7.7068e-06, + "loss": 0.0021, + "num_tokens": 32592881.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 383.125, + "completions/mean_terminated_length": 383.125, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.07796754004455288, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.109375, + "kl": 0.04508164117578417, + "learning_rate": 7.7064e-06, + "loss": 0.0018, + "num_tokens": 32650485.0, + "reward": 3.0846705436706543, + "reward_std": 0.07166870683431625, + "rewards/reward_fn/mean": 3.0846705436706543, + "rewards/reward_fn/std": 0.07166869193315506, + "step": 735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 91.0, + "completions/max_terminated_length": 91.0, + "completions/mean_length": 84.71875, + "completions/mean_terminated_length": 84.71875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.07807361833032778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10302734375, + "kl": 0.025161666912026703, + "learning_rate": 7.706e-06, + "loss": 0.001, + "num_tokens": 32696460.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 234.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 200.65625, + "completions/mean_terminated_length": 200.65625, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.07817969661610269, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08984375, + "kl": 0.044102601415943354, + "learning_rate": 7.7056e-06, + "loss": 0.0018, + "num_tokens": 32737377.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 265.625, + "completions/mean_terminated_length": 265.625, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.07828577490187759, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.04195326648186892, + "learning_rate": 7.705199999999999e-06, + "loss": 0.0017, + "num_tokens": 32762997.0, + "reward": 2.9239230155944824, + "reward_std": 0.020110029727220535, + "rewards/reward_fn/mean": 2.9239230155944824, + "rewards/reward_fn/std": 0.02011003904044628, + "step": 738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 279.125, + "completions/mean_terminated_length": 279.125, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.07839185318765249, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.048967642593197525, + "learning_rate": 7.7048e-06, + "loss": 0.002, + "num_tokens": 32810553.0, + "reward": 3.0665769577026367, + "reward_std": 0.17190836369991302, + "rewards/reward_fn/mean": 3.0665769577026367, + "rewards/reward_fn/std": 0.17190837860107422, + "step": 739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 242.21875, + "completions/mean_terminated_length": 242.21875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.07849793147342739, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1005859375, + "kl": 0.05765526695176959, + "learning_rate": 7.704399999999999e-06, + "loss": 0.0023, + "num_tokens": 32882304.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 167.0, + "completions/max_terminated_length": 167.0, + "completions/mean_length": 119.625, + "completions/mean_terminated_length": 119.625, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.07860400975920229, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.34375, + "kl": 0.05220557638676837, + "learning_rate": 7.704e-06, + "loss": 0.0021, + "num_tokens": 32919092.0, + "reward": 3.2061731815338135, + "reward_std": 0.0629623532295227, + "rewards/reward_fn/mean": 3.2061731815338135, + "rewards/reward_fn/std": 0.06296232342720032, + "step": 741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 164.40625, + "completions/mean_terminated_length": 164.40625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.0787100880449772, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.140625, + "kl": 0.04799942090176046, + "learning_rate": 7.7036e-06, + "loss": 0.0019, + "num_tokens": 32954721.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 217.0625, + "completions/mean_terminated_length": 217.0625, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.0788161663307521, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.04830415640026331, + "learning_rate": 7.7032e-06, + "loss": 0.0019, + "num_tokens": 33002563.0, + "reward": 3.9048845767974854, + "reward_std": 0.3005771040916443, + "rewards/reward_fn/mean": 3.9048845767974854, + "rewards/reward_fn/std": 0.3005771338939667, + "step": 743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 330.90625, + "completions/mean_terminated_length": 330.90625, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.078922244616527, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1875, + "kl": 0.05085050774505362, + "learning_rate": 7.7028e-06, + "loss": 0.002, + "num_tokens": 33033440.0, + "reward": 2.793060302734375, + "reward_std": 0.5311539173126221, + "rewards/reward_fn/mean": 2.793060302734375, + "rewards/reward_fn/std": 0.5311539173126221, + "step": 744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 107.0, + "completions/max_terminated_length": 107.0, + "completions/mean_length": 99.71875, + "completions/mean_terminated_length": 99.71875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.0790283229023019, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07421875, + "kl": 0.019555108272470534, + "learning_rate": 7.7024e-06, + "loss": 0.0008, + "num_tokens": 33075415.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 207.15625, + "completions/mean_terminated_length": 207.15625, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.0791344011880768, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.03279089758871123, + "learning_rate": 7.702e-06, + "loss": 0.0013, + "num_tokens": 33118108.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 206.4375, + "completions/mean_terminated_length": 206.4375, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.0792404794738517, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09814453125, + "kl": 0.04685262369457632, + "learning_rate": 7.7016e-06, + "loss": 0.0019, + "num_tokens": 33146314.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 149.9375, + "completions/mean_terminated_length": 149.9375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.07934655775962661, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.546875, + "kl": 0.06066811521304771, + "learning_rate": 7.7012e-06, + "loss": 0.0024, + "num_tokens": 33171688.0, + "reward": 3.004272699356079, + "reward_std": 0.037238411605358124, + "rewards/reward_fn/mean": 3.004272699356079, + "rewards/reward_fn/std": 0.037238407880067825, + "step": 748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 155.0, + "completions/max_terminated_length": 155.0, + "completions/mean_length": 112.84375, + "completions/mean_terminated_length": 112.84375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.07945263604540151, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1552734375, + "kl": 0.07324639323633164, + "learning_rate": 7.7008e-06, + "loss": 0.0029, + "num_tokens": 33221411.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 165.0, + "completions/max_terminated_length": 165.0, + "completions/mean_length": 129.59375, + "completions/mean_terminated_length": 129.59375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.07955871433117641, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0986328125, + "kl": 0.023909973591798916, + "learning_rate": 7.7004e-06, + "loss": 0.001, + "num_tokens": 33269782.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 170.65625, + "completions/mean_terminated_length": 170.65625, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.07966479261695131, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.140625, + "kl": 0.06489493430126458, + "learning_rate": 7.699999999999999e-06, + "loss": 0.0026, + "num_tokens": 33311339.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 208.28125, + "completions/mean_terminated_length": 208.28125, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.07977087090272621, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.06098278262652457, + "learning_rate": 7.6996e-06, + "loss": 0.0024, + "num_tokens": 33369396.0, + "reward": 3.959812641143799, + "reward_std": 0.22733472287654877, + "rewards/reward_fn/mean": 3.959812641143799, + "rewards/reward_fn/std": 0.22733475267887115, + "step": 752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 171.0, + "completions/max_terminated_length": 171.0, + "completions/mean_length": 113.96875, + "completions/mean_terminated_length": 113.96875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.07987694918850112, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1435546875, + "kl": 0.04324968159198761, + "learning_rate": 7.6992e-06, + "loss": 0.0017, + "num_tokens": 33407827.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 945.0, + "completions/max_terminated_length": 945.0, + "completions/mean_length": 557.0, + "completions/mean_terminated_length": 557.0, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.07998302747427602, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.04723305656807497, + "learning_rate": 7.6988e-06, + "loss": 0.0019, + "num_tokens": 33470643.0, + "reward": 2.8307480812072754, + "reward_std": 0.05510137230157852, + "rewards/reward_fn/mean": 2.8307480812072754, + "rewards/reward_fn/std": 0.05510134622454643, + "step": 754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 96.25, + "completions/mean_terminated_length": 96.25, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.08008910576005092, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14453125, + "kl": 0.03696786391083151, + "learning_rate": 7.6984e-06, + "loss": 0.0015, + "num_tokens": 33511419.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 175.0, + "completions/max_terminated_length": 175.0, + "completions/mean_length": 142.75, + "completions/mean_terminated_length": 142.75, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.08019518404582582, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1279296875, + "kl": 0.04854514845646918, + "learning_rate": 7.698e-06, + "loss": 0.0019, + "num_tokens": 33545587.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 175.09375, + "completions/mean_terminated_length": 175.09375, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.08030126233160072, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.039331718639004976, + "learning_rate": 7.6976e-06, + "loss": 0.0016, + "num_tokens": 33587734.0, + "reward": 2.9345703125, + "reward_std": 0.014557859860360622, + "rewards/reward_fn/mean": 2.9345703125, + "rewards/reward_fn/std": 0.014557869173586369, + "step": 757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 176.34375, + "completions/mean_terminated_length": 176.34375, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.08040734061737562, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1279296875, + "kl": 0.039623707241844386, + "learning_rate": 7.6972e-06, + "loss": 0.0016, + "num_tokens": 33611521.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 238.625, + "completions/mean_terminated_length": 238.625, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.08051341890315053, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.05152698885649443, + "learning_rate": 7.696799999999999e-06, + "loss": 0.0021, + "num_tokens": 33652117.0, + "reward": 3.8735275268554688, + "reward_std": 0.2986966669559479, + "rewards/reward_fn/mean": 3.8735275268554688, + "rewards/reward_fn/std": 0.2986966669559479, + "step": 759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 265.4375, + "completions/mean_terminated_length": 265.4375, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.08061949718892543, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.04387016425607726, + "learning_rate": 7.6964e-06, + "loss": 0.0018, + "num_tokens": 33682211.0, + "reward": 2.870743751525879, + "reward_std": 0.029427386820316315, + "rewards/reward_fn/mean": 2.870743751525879, + "rewards/reward_fn/std": 0.029427384957671165, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 336.125, + "completions/mean_terminated_length": 336.125, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.08072557547470033, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09912109375, + "kl": 0.04809122672304511, + "learning_rate": 7.695999999999999e-06, + "loss": 0.0019, + "num_tokens": 33731975.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 180.53125, + "completions/mean_terminated_length": 180.53125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.08083165376047523, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.0938791740918532, + "learning_rate": 7.6956e-06, + "loss": 0.0038, + "num_tokens": 33758904.0, + "reward": 3.966001272201538, + "reward_std": 0.19232597947120667, + "rewards/reward_fn/mean": 3.966001272201538, + "rewards/reward_fn/std": 0.19232596457004547, + "step": 762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 293.84375, + "completions/mean_terminated_length": 293.84375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.08093773204625013, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5625, + "kl": 0.1058124101255089, + "learning_rate": 7.695199999999999e-06, + "loss": 0.0042, + "num_tokens": 33796659.0, + "reward": 2.5515646934509277, + "reward_std": 0.5192806720733643, + "rewards/reward_fn/mean": 2.5515646934509277, + "rewards/reward_fn/std": 0.5192806720733643, + "step": 763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.0, + "completions/max_terminated_length": 573.0, + "completions/mean_length": 436.25, + "completions/mean_terminated_length": 436.25, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.08104381033202504, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.061225235112942755, + "learning_rate": 7.6948e-06, + "loss": 0.0024, + "num_tokens": 33843131.0, + "reward": 2.933668375015259, + "reward_std": 0.052915628999471664, + "rewards/reward_fn/mean": 2.933668375015259, + "rewards/reward_fn/std": 0.05291564017534256, + "step": 764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 230.96875, + "completions/mean_terminated_length": 230.96875, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.08114988861779994, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.050095128593966365, + "learning_rate": 7.6944e-06, + "loss": 0.002, + "num_tokens": 33868250.0, + "reward": 3.302189350128174, + "reward_std": 0.039182018488645554, + "rewards/reward_fn/mean": 3.302189350128174, + "rewards/reward_fn/std": 0.03918198496103287, + "step": 765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 231.65625, + "completions/mean_terminated_length": 231.65625, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.08125596690357484, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.03903214994352311, + "learning_rate": 7.694e-06, + "loss": 0.0016, + "num_tokens": 33926703.0, + "reward": 3.2802817821502686, + "reward_std": 0.5692557096481323, + "rewards/reward_fn/mean": 3.2802817821502686, + "rewards/reward_fn/std": 0.5692556500434875, + "step": 766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 307.5625, + "completions/mean_terminated_length": 307.5625, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.08136204518934974, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.03617817087797448, + "learning_rate": 7.6936e-06, + "loss": 0.0014, + "num_tokens": 33986145.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.0, + "completions/max_terminated_length": 642.0, + "completions/mean_length": 385.15625, + "completions/mean_terminated_length": 385.15625, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.08146812347512464, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.07895687408745289, + "learning_rate": 7.6932e-06, + "loss": 0.0032, + "num_tokens": 34029030.0, + "reward": 3.0626044273376465, + "reward_std": 0.08735579997301102, + "rewards/reward_fn/mean": 3.0626044273376465, + "rewards/reward_fn/std": 0.08735582232475281, + "step": 768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 132.875, + "completions/mean_terminated_length": 132.875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.08157420176089955, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1455078125, + "kl": 0.0394937681267038, + "learning_rate": 7.6928e-06, + "loss": 0.0016, + "num_tokens": 34062274.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 192.40625, + "completions/mean_terminated_length": 192.40625, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.08168028004667445, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "kl": 0.05581576417898759, + "learning_rate": 7.6924e-06, + "loss": 0.0022, + "num_tokens": 34101487.0, + "reward": 2.8739709854125977, + "reward_std": 0.07851967960596085, + "rewards/reward_fn/mean": 2.8739709854125977, + "rewards/reward_fn/std": 0.07851970195770264, + "step": 770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 137.15625, + "completions/mean_terminated_length": 137.15625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.08178635833244935, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10693359375, + "kl": 0.047419965849258006, + "learning_rate": 7.692e-06, + "loss": 0.0019, + "num_tokens": 34126324.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 207.03125, + "completions/mean_terminated_length": 207.03125, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.08189243661822425, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.115234375, + "kl": 0.02880560705671087, + "learning_rate": 7.6916e-06, + "loss": 0.0012, + "num_tokens": 34164597.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 262.96875, + "completions/mean_terminated_length": 262.96875, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.08199851490399915, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0693359375, + "kl": 0.034354471892584115, + "learning_rate": 7.6912e-06, + "loss": 0.0014, + "num_tokens": 34206228.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 195.0, + "completions/max_terminated_length": 195.0, + "completions/mean_length": 174.96875, + "completions/mean_terminated_length": 174.96875, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.08210459318977405, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.03377506849938072, + "learning_rate": 7.6908e-06, + "loss": 0.0014, + "num_tokens": 34263987.0, + "reward": 3.027437686920166, + "reward_std": 0.023412281647324562, + "rewards/reward_fn/mean": 3.027437686920166, + "rewards/reward_fn/std": 0.023412277922034264, + "step": 774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 188.4375, + "completions/mean_terminated_length": 188.4375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.08221067147554896, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0986328125, + "kl": 0.03890780231449753, + "learning_rate": 7.6904e-06, + "loss": 0.0016, + "num_tokens": 34304385.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 283.65625, + "completions/mean_terminated_length": 283.65625, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.08231674976132386, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07421875, + "kl": 0.03191797385807149, + "learning_rate": 7.69e-06, + "loss": 0.0013, + "num_tokens": 34345942.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 132.625, + "completions/mean_terminated_length": 132.625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.08242282804709876, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5, + "kl": 0.06634008465334773, + "learning_rate": 7.6896e-06, + "loss": 0.0026, + "num_tokens": 34385514.0, + "reward": 2.8560264110565186, + "reward_std": 0.024654332548379898, + "rewards/reward_fn/mean": 2.8560264110565186, + "rewards/reward_fn/std": 0.024654347449541092, + "step": 777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 202.5, + "completions/mean_terminated_length": 202.5, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.08252890633287366, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.07904170651454479, + "learning_rate": 7.6892e-06, + "loss": 0.0032, + "num_tokens": 34429882.0, + "reward": 3.814371109008789, + "reward_std": 0.39530250430107117, + "rewards/reward_fn/mean": 3.814371109008789, + "rewards/reward_fn/std": 0.3953024744987488, + "step": 778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 94.0, + "completions/max_terminated_length": 94.0, + "completions/mean_length": 86.90625, + "completions/mean_terminated_length": 86.90625, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.08263498461864856, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09912109375, + "kl": 0.021457875292981043, + "learning_rate": 7.6888e-06, + "loss": 0.0009, + "num_tokens": 34468695.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 328.5, + "completions/mean_terminated_length": 328.5, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.08274106290442347, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.054818932549096644, + "learning_rate": 7.688399999999999e-06, + "loss": 0.0022, + "num_tokens": 34513159.0, + "reward": 2.7019734382629395, + "reward_std": 0.16798020899295807, + "rewards/reward_fn/mean": 2.7019734382629395, + "rewards/reward_fn/std": 0.16798023879528046, + "step": 780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.0, + "completions/max_terminated_length": 582.0, + "completions/mean_length": 292.5, + "completions/mean_terminated_length": 292.5, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.08284714119019837, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.06239066878333688, + "learning_rate": 7.688e-06, + "loss": 0.0025, + "num_tokens": 34552759.0, + "reward": 2.781367301940918, + "reward_std": 0.028873471543192863, + "rewards/reward_fn/mean": 2.781367301940918, + "rewards/reward_fn/std": 0.028873484581708908, + "step": 781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 148.0, + "completions/max_terminated_length": 148.0, + "completions/mean_length": 118.96875, + "completions/mean_terminated_length": 118.96875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.08295321947597327, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.013791139441309497, + "learning_rate": 7.687599999999999e-06, + "loss": 0.0005, + "num_tokens": 34589942.0, + "reward": 3.1045916080474854, + "reward_std": 0.0047982302494347095, + "rewards/reward_fn/mean": 3.1045916080474854, + "rewards/reward_fn/std": 0.004798218607902527, + "step": 782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 612.0, + "completions/max_terminated_length": 612.0, + "completions/mean_length": 412.84375, + "completions/mean_terminated_length": 412.84375, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.08305929776174817, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.84375, + "kl": 0.03246723604388535, + "learning_rate": 7.6872e-06, + "loss": 0.0013, + "num_tokens": 34660113.0, + "reward": 3.670332908630371, + "reward_std": 0.7970924377441406, + "rewards/reward_fn/mean": 3.670332908630371, + "rewards/reward_fn/std": 0.7970924377441406, + "step": 783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 240.1875, + "completions/mean_terminated_length": 240.1875, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.08316537604752307, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.05126189975999296, + "learning_rate": 7.686799999999999e-06, + "loss": 0.0021, + "num_tokens": 34686967.0, + "reward": 3.855468988418579, + "reward_std": 0.3885265290737152, + "rewards/reward_fn/mean": 3.855468988418579, + "rewards/reward_fn/std": 0.3885265290737152, + "step": 784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1312.0, + "completions/max_terminated_length": 1312.0, + "completions/mean_length": 725.59375, + "completions/mean_terminated_length": 725.59375, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.08327145433329797, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.08870382863096893, + "learning_rate": 7.6864e-06, + "loss": 0.0035, + "num_tokens": 34748234.0, + "reward": 2.293332815170288, + "reward_std": 0.5246787071228027, + "rewards/reward_fn/mean": 2.293332815170288, + "rewards/reward_fn/std": 0.5246787667274475, + "step": 785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 200.59375, + "completions/mean_terminated_length": 200.59375, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.08337753261907288, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.04654703033156693, + "learning_rate": 7.685999999999999e-06, + "loss": 0.0019, + "num_tokens": 34801277.0, + "reward": 3.829470157623291, + "reward_std": 0.36089715361595154, + "rewards/reward_fn/mean": 3.829470157623291, + "rewards/reward_fn/std": 0.3608972132205963, + "step": 786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 158.9375, + "completions/mean_terminated_length": 158.9375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.08348361090484778, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "kl": 0.07684183481615037, + "learning_rate": 7.6856e-06, + "loss": 0.0031, + "num_tokens": 34840795.0, + "reward": 2.852140188217163, + "reward_std": 0.03964653238654137, + "rewards/reward_fn/mean": 2.852140188217163, + "rewards/reward_fn/std": 0.03964650258421898, + "step": 787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 221.0, + "completions/max_terminated_length": 221.0, + "completions/mean_length": 187.96875, + "completions/mean_terminated_length": 187.96875, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.08358968919062268, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.080078125, + "kl": 0.048744586820248514, + "learning_rate": 7.685199999999999e-06, + "loss": 0.0019, + "num_tokens": 34878458.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 171.0, + "completions/max_terminated_length": 171.0, + "completions/mean_length": 150.53125, + "completions/mean_terminated_length": 150.53125, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.08369576747639758, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11083984375, + "kl": 0.04753570049069822, + "learning_rate": 7.6848e-06, + "loss": 0.0019, + "num_tokens": 34916299.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.0, + "completions/max_terminated_length": 561.0, + "completions/mean_length": 283.9375, + "completions/mean_terminated_length": 283.9375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.08380184576217248, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.09668770106509328, + "learning_rate": 7.6844e-06, + "loss": 0.0039, + "num_tokens": 34974889.0, + "reward": 2.762829303741455, + "reward_std": 0.036893703043460846, + "rewards/reward_fn/mean": 2.762829303741455, + "rewards/reward_fn/std": 0.03689371794462204, + "step": 790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.0, + "completions/max_terminated_length": 568.0, + "completions/mean_length": 400.75, + "completions/mean_terminated_length": 400.75, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.08390792404794739, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0, + "kl": 0.03603732128976844, + "learning_rate": 7.684e-06, + "loss": 0.0014, + "num_tokens": 35032673.0, + "reward": 3.586613178253174, + "reward_std": 0.6132650375366211, + "rewards/reward_fn/mean": 3.586613178253174, + "rewards/reward_fn/std": 0.6132650375366211, + "step": 791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 148.0, + "completions/max_terminated_length": 148.0, + "completions/mean_length": 100.9375, + "completions/mean_terminated_length": 100.9375, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.08401400233372229, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1494140625, + "kl": 0.05937142693437636, + "learning_rate": 7.6836e-06, + "loss": 0.0024, + "num_tokens": 35063263.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 151.46875, + "completions/mean_terminated_length": 151.46875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.08412008061949719, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13671875, + "kl": 0.06238317512907088, + "learning_rate": 7.6832e-06, + "loss": 0.0025, + "num_tokens": 35102414.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 181.90625, + "completions/mean_terminated_length": 181.90625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.08422615890527209, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11376953125, + "kl": 0.0344988465658389, + "learning_rate": 7.6828e-06, + "loss": 0.0014, + "num_tokens": 35141195.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 249.6875, + "completions/mean_terminated_length": 249.6875, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.08433223719104699, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1416015625, + "kl": 0.05292107805144042, + "learning_rate": 7.6824e-06, + "loss": 0.0021, + "num_tokens": 35184545.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 848.0, + "completions/max_terminated_length": 848.0, + "completions/mean_length": 381.375, + "completions/mean_terminated_length": 381.375, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.0844383154768219, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.082465800922364, + "learning_rate": 7.682e-06, + "loss": 0.0033, + "num_tokens": 35226829.0, + "reward": 2.704169750213623, + "reward_std": 0.029407240450382233, + "rewards/reward_fn/mean": 2.704169750213623, + "rewards/reward_fn/std": 0.02940722554922104, + "step": 796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 826.0, + "completions/max_terminated_length": 826.0, + "completions/mean_length": 429.0, + "completions/mean_terminated_length": 429.0, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.0845443937625968, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.08249839709606022, + "learning_rate": 7.6816e-06, + "loss": 0.0033, + "num_tokens": 35277645.0, + "reward": 2.570937156677246, + "reward_std": 0.46163350343704224, + "rewards/reward_fn/mean": 2.570937156677246, + "rewards/reward_fn/std": 0.46163344383239746, + "step": 797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 196.0, + "completions/max_terminated_length": 196.0, + "completions/mean_length": 138.25, + "completions/mean_terminated_length": 138.25, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.0846504720483717, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1474609375, + "kl": 0.052692751720314845, + "learning_rate": 7.681199999999999e-06, + "loss": 0.0021, + "num_tokens": 35307957.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 246.03125, + "completions/mean_terminated_length": 246.03125, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.0847565503341466, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.049287278146948665, + "learning_rate": 7.6808e-06, + "loss": 0.002, + "num_tokens": 35354038.0, + "reward": 2.8469812870025635, + "reward_std": 0.30314865708351135, + "rewards/reward_fn/mean": 2.8469812870025635, + "rewards/reward_fn/std": 0.30314865708351135, + "step": 799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 194.96875, + "completions/mean_terminated_length": 194.96875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.0848626286199215, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.115234375, + "kl": 0.06779409275623038, + "learning_rate": 7.680399999999998e-06, + "loss": 0.0027, + "num_tokens": 35391893.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 800 + } + ], + "logging_steps": 1, + "max_steps": 20000, + "num_input_tokens_seen": 35391893, + "num_train_epochs": 3, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}