diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,57234 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.23337222870478413, + "eval_steps": 500, + "global_step": 2200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1719.0, + "completions/mean_length": 680.875, + "completions/mean_terminated_length": 427.7037048339844, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.00010607828577490188, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.0, + "learning_rate": 8e-06, + "loss": 0.3843, + "num_tokens": 79324.0, + "reward": 1.4974994659423828, + "reward_std": 0.8458996415138245, + "rewards/reward_fn/mean": 1.4974994659423828, + "rewards/reward_fn/std": 0.8458995819091797, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1127.0, + "completions/max_terminated_length": 1127.0, + "completions/mean_length": 215.25, + "completions/mean_terminated_length": 215.25, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.00021215657154980376, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.453125, + "kl": 0.008525475102942437, + "learning_rate": 7.9996e-06, + "loss": 0.0045, + "num_tokens": 116452.0, + "reward": 2.7323050498962402, + "reward_std": 0.185869961977005, + "rewards/reward_fn/mean": 2.7323050498962402, + "rewards/reward_fn/std": 0.1858699470758438, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 81.03125, + "completions/mean_terminated_length": 81.03125, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.0003182348573247056, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2158203125, + "kl": 0.01689625042490661, + "learning_rate": 7.9992e-06, + "loss": 0.0007, + "num_tokens": 155109.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1005.0, + "completions/max_terminated_length": 1005.0, + "completions/mean_length": 332.375, + "completions/mean_terminated_length": 332.375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.0004243131430996075, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.007169858436100185, + "learning_rate": 7.9988e-06, + "loss": 0.0173, + "num_tokens": 206385.0, + "reward": 2.862459897994995, + "reward_std": 0.067531056702137, + "rewards/reward_fn/mean": 2.862459897994995, + "rewards/reward_fn/std": 0.067531056702137, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 239.96875, + "completions/mean_terminated_length": 239.96875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.0005303914288745094, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08251953125, + "kl": 0.012080345884896815, + "learning_rate": 7.9984e-06, + "loss": 0.0005, + "num_tokens": 246896.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 268.71875, + "completions/mean_terminated_length": 268.71875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.0006364697146494112, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.011115878820419312, + "learning_rate": 7.998e-06, + "loss": 0.0878, + "num_tokens": 284999.0, + "reward": 3.22149658203125, + "reward_std": 0.9693878889083862, + "rewards/reward_fn/mean": 3.22149658203125, + "rewards/reward_fn/std": 0.969387948513031, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1717.0, + "completions/max_terminated_length": 1717.0, + "completions/mean_length": 400.0, + "completions/mean_terminated_length": 400.0, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.0007425480004243131, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0703125, + "kl": 0.007328710809815675, + "learning_rate": 7.9976e-06, + "loss": -0.119, + "num_tokens": 331847.0, + "reward": 2.9880638122558594, + "reward_std": 0.7341222763061523, + "rewards/reward_fn/mean": 2.9880638122558594, + "rewards/reward_fn/std": 0.7341222763061523, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1423.0, + "completions/mean_length": 340.5625, + "completions/mean_terminated_length": 285.4838562011719, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.000848626286199215, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.00942237873096019, + "learning_rate": 7.9972e-06, + "loss": 0.3223, + "num_tokens": 384729.0, + "reward": 3.8373069763183594, + "reward_std": 0.731940746307373, + "rewards/reward_fn/mean": 3.8373069763183594, + "rewards/reward_fn/std": 0.731940746307373, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 165.0625, + "completions/mean_terminated_length": 165.0625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.0009547045719741168, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08935546875, + "kl": 0.013073404785245657, + "learning_rate": 7.9968e-06, + "loss": 0.0005, + "num_tokens": 422107.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 946.0, + "completions/max_terminated_length": 946.0, + "completions/mean_length": 356.03125, + "completions/mean_terminated_length": 356.03125, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.0010607828577490189, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.008752175432164222, + "learning_rate": 7.9964e-06, + "loss": 0.0989, + "num_tokens": 488924.0, + "reward": 3.7715067863464355, + "reward_std": 0.5657153129577637, + "rewards/reward_fn/mean": 3.7715067863464355, + "rewards/reward_fn/std": 0.5657153725624084, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 132.59375, + "completions/mean_terminated_length": 132.59375, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.0011668611435239206, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.796875, + "kl": 0.013385511934757233, + "learning_rate": 7.996e-06, + "loss": -0.0505, + "num_tokens": 529903.0, + "reward": 3.9310073852539062, + "reward_std": 0.2188190072774887, + "rewards/reward_fn/mean": 3.9310073852539062, + "rewards/reward_fn/std": 0.2188190221786499, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 254.96875, + "completions/mean_terminated_length": 254.96875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.0012729394292988225, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.011238898383453488, + "learning_rate": 7.995599999999998e-06, + "loss": 0.0004, + "num_tokens": 570670.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1664.0, + "completions/max_terminated_length": 1664.0, + "completions/mean_length": 629.1875, + "completions/mean_terminated_length": 629.1875, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "epoch": 0.0013790177150737244, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.007070357620250434, + "learning_rate": 7.9952e-06, + "loss": 0.0796, + "num_tokens": 638228.0, + "reward": 3.1080098152160645, + "reward_std": 0.934657096862793, + "rewards/reward_fn/mean": 3.1080098152160645, + "rewards/reward_fn/std": 0.934657096862793, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1005.0, + "completions/mean_length": 623.21875, + "completions/mean_terminated_length": 475.82757568359375, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.0014850960008486263, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.984375, + "kl": 0.007602842408232391, + "learning_rate": 7.9948e-06, + "loss": 0.0701, + "num_tokens": 698267.0, + "reward": 2.447237014770508, + "reward_std": 0.8652064204216003, + "rewards/reward_fn/mean": 2.447237014770508, + "rewards/reward_fn/std": 0.8652064204216003, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 247.125, + "completions/mean_terminated_length": 247.125, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.0015911742866235282, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.012013918720185757, + "learning_rate": 7.9944e-06, + "loss": -0.0071, + "num_tokens": 745183.0, + "reward": 3.933253765106201, + "reward_std": 0.2631880044937134, + "rewards/reward_fn/mean": 3.933253765106201, + "rewards/reward_fn/std": 0.263187974691391, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1180.0, + "completions/max_terminated_length": 1180.0, + "completions/mean_length": 416.46875, + "completions/mean_terminated_length": 416.46875, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.00169725257239843, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.01027172978501767, + "learning_rate": 7.994e-06, + "loss": 0.0592, + "num_tokens": 789742.0, + "reward": 2.780426502227783, + "reward_std": 0.21798433363437653, + "rewards/reward_fn/mean": 2.780426502227783, + "rewards/reward_fn/std": 0.21798425912857056, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 223.59375, + "completions/mean_terminated_length": 223.59375, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.001803330858173332, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.013605716056190431, + "learning_rate": 7.9936e-06, + "loss": 0.012, + "num_tokens": 835201.0, + "reward": 2.994292736053467, + "reward_std": 0.4485239088535309, + "rewards/reward_fn/mean": 2.994292736053467, + "rewards/reward_fn/std": 0.4485238790512085, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 963.0, + "completions/max_terminated_length": 963.0, + "completions/mean_length": 200.9375, + "completions/mean_terminated_length": 200.9375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.0019094091439482337, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.59375, + "kl": 0.011645498918369412, + "learning_rate": 7.9932e-06, + "loss": 0.0843, + "num_tokens": 862143.0, + "reward": 3.747105121612549, + "reward_std": 0.5640392899513245, + "rewards/reward_fn/mean": 3.747105121612549, + "rewards/reward_fn/std": 0.5640392899513245, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 387.59375, + "completions/mean_terminated_length": 387.59375, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.002015487429723136, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.007694089494179934, + "learning_rate": 7.992799999999999e-06, + "loss": 0.0621, + "num_tokens": 910226.0, + "reward": 3.5295071601867676, + "reward_std": 0.6173213124275208, + "rewards/reward_fn/mean": 3.5295071601867676, + "rewards/reward_fn/std": 0.617321252822876, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 208.96875, + "completions/mean_terminated_length": 208.96875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.0021215657154980377, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.015368042862974107, + "learning_rate": 7.9924e-06, + "loss": 0.0099, + "num_tokens": 940337.0, + "reward": 3.5482964515686035, + "reward_std": 0.8371409177780151, + "rewards/reward_fn/mean": 3.5482964515686035, + "rewards/reward_fn/std": 0.8371408581733704, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 605.0, + "completions/max_terminated_length": 605.0, + "completions/mean_length": 173.0, + "completions/mean_terminated_length": 173.0, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.0022276440012729396, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.014460677281022072, + "learning_rate": 7.991999999999999e-06, + "loss": -0.0116, + "num_tokens": 960817.0, + "reward": 3.8916022777557373, + "reward_std": 0.45123252272605896, + "rewards/reward_fn/mean": 3.8916022777557373, + "rewards/reward_fn/std": 0.4512324333190918, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 575.0, + "completions/max_terminated_length": 575.0, + "completions/mean_length": 380.53125, + "completions/mean_terminated_length": 380.53125, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.002333722287047841, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.010017184424214065, + "learning_rate": 7.9916e-06, + "loss": 0.0591, + "num_tokens": 1025506.0, + "reward": 2.726224184036255, + "reward_std": 0.2123180776834488, + "rewards/reward_fn/mean": 2.726224184036255, + "rewards/reward_fn/std": 0.2123180478811264, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1530.0, + "completions/max_terminated_length": 1530.0, + "completions/mean_length": 537.15625, + "completions/mean_terminated_length": 537.15625, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.002439800572822743, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.078125, + "kl": 0.008808196464087814, + "learning_rate": 7.991199999999999e-06, + "loss": -0.0658, + "num_tokens": 1081447.0, + "reward": 2.7414865493774414, + "reward_std": 0.947241485118866, + "rewards/reward_fn/mean": 2.7414865493774414, + "rewards/reward_fn/std": 0.9472415447235107, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 751.0, + "completions/max_terminated_length": 751.0, + "completions/mean_length": 412.5625, + "completions/mean_terminated_length": 412.5625, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.002545878858597645, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.03125, + "kl": 0.009269667672924697, + "learning_rate": 7.9908e-06, + "loss": -0.0071, + "num_tokens": 1141273.0, + "reward": 2.676821231842041, + "reward_std": 0.5632266998291016, + "rewards/reward_fn/mean": 2.676821231842041, + "rewards/reward_fn/std": 0.5632267594337463, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 322.75, + "completions/mean_terminated_length": 322.75, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.002651957144372547, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.011170931567903608, + "learning_rate": 7.9904e-06, + "loss": 0.0352, + "num_tokens": 1196913.0, + "reward": 3.0973877906799316, + "reward_std": 0.7358191609382629, + "rewards/reward_fn/mean": 3.0973877906799316, + "rewards/reward_fn/std": 0.7358191609382629, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 928.0, + "completions/max_terminated_length": 928.0, + "completions/mean_length": 245.03125, + "completions/mean_terminated_length": 245.03125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.0027580354301474487, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.01611144037451595, + "learning_rate": 7.99e-06, + "loss": 0.1088, + "num_tokens": 1239122.0, + "reward": 2.7557084560394287, + "reward_std": 0.3039965033531189, + "rewards/reward_fn/mean": 2.7557084560394287, + "rewards/reward_fn/std": 0.3039964735507965, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 174.46875, + "completions/mean_terminated_length": 174.46875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.0028641137159223506, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.53125, + "kl": 0.016508221509866416, + "learning_rate": 7.9896e-06, + "loss": 0.0277, + "num_tokens": 1273281.0, + "reward": 3.8924479484558105, + "reward_std": 0.4453147053718567, + "rewards/reward_fn/mean": 3.8924479484558105, + "rewards/reward_fn/std": 0.4453147053718567, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.0, + "completions/max_terminated_length": 559.0, + "completions/mean_length": 229.53125, + "completions/mean_terminated_length": 229.53125, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.0029701920016972526, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.21875, + "kl": 0.01914752251468599, + "learning_rate": 7.9892e-06, + "loss": -0.0023, + "num_tokens": 1329714.0, + "reward": 3.8488998413085938, + "reward_std": 0.40621453523635864, + "rewards/reward_fn/mean": 3.8488998413085938, + "rewards/reward_fn/std": 0.40621453523635864, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1542.0, + "completions/mean_length": 632.59375, + "completions/mean_terminated_length": 538.2333374023438, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.0030762702874721545, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.01128252933267504, + "learning_rate": 7.9888e-06, + "loss": 0.2658, + "num_tokens": 1391141.0, + "reward": 2.293217658996582, + "reward_std": 0.7491805553436279, + "rewards/reward_fn/mean": 2.293217658996582, + "rewards/reward_fn/std": 0.7491805553436279, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 855.0, + "completions/max_terminated_length": 855.0, + "completions/mean_length": 457.8125, + "completions/mean_terminated_length": 457.8125, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.0031823485732470564, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.00988059863448143, + "learning_rate": 7.9884e-06, + "loss": -0.0261, + "num_tokens": 1421887.0, + "reward": 1.8365099430084229, + "reward_std": 0.3920939862728119, + "rewards/reward_fn/mean": 1.8365099430084229, + "rewards/reward_fn/std": 0.3920939564704895, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 622.0, + "completions/max_terminated_length": 622.0, + "completions/mean_length": 234.03125, + "completions/mean_terminated_length": 234.03125, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.0032884268590219583, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.453125, + "kl": 0.016112850280478597, + "learning_rate": 7.988e-06, + "loss": 0.0618, + "num_tokens": 1464800.0, + "reward": 3.961313009262085, + "reward_std": 0.21884705126285553, + "rewards/reward_fn/mean": 3.961313009262085, + "rewards/reward_fn/std": 0.21884708106517792, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1348.0, + "completions/max_terminated_length": 1348.0, + "completions/mean_length": 319.59375, + "completions/mean_terminated_length": 319.59375, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.00339450514479686, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "kl": 0.015100088901817799, + "learning_rate": 7.9876e-06, + "loss": 0.1985, + "num_tokens": 1511891.0, + "reward": 2.7325994968414307, + "reward_std": 0.19626843929290771, + "rewards/reward_fn/mean": 2.7325994968414307, + "rewards/reward_fn/std": 0.19626840949058533, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 215.96875, + "completions/mean_terminated_length": 215.96875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.003500583430571762, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.013935832539573312, + "learning_rate": 7.987199999999999e-06, + "loss": 0.076, + "num_tokens": 1553202.0, + "reward": 3.828774929046631, + "reward_std": 0.564949631690979, + "rewards/reward_fn/mean": 3.828774929046631, + "rewards/reward_fn/std": 0.564949631690979, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 220.25, + "completions/mean_terminated_length": 220.25, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.003606661716346664, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.021082005463540554, + "learning_rate": 7.9868e-06, + "loss": 0.0509, + "num_tokens": 1598394.0, + "reward": 3.139282464981079, + "reward_std": 0.714324951171875, + "rewards/reward_fn/mean": 3.139282464981079, + "rewards/reward_fn/std": 0.7143248915672302, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 619.0, + "completions/mean_length": 389.8125, + "completions/mean_terminated_length": 336.32257080078125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.003712740002121566, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.014661112450994551, + "learning_rate": 7.986399999999999e-06, + "loss": 0.1028, + "num_tokens": 1643572.0, + "reward": 2.1648662090301514, + "reward_std": 0.8039337992668152, + "rewards/reward_fn/mean": 2.1648662090301514, + "rewards/reward_fn/std": 0.8039337396621704, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 271.8125, + "completions/mean_terminated_length": 271.8125, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.0038188182878964674, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "kl": 0.017005216563120484, + "learning_rate": 7.986e-06, + "loss": -0.0162, + "num_tokens": 1687534.0, + "reward": 2.9149179458618164, + "reward_std": 0.5480148196220398, + "rewards/reward_fn/mean": 2.9149179458618164, + "rewards/reward_fn/std": 0.548014760017395, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 584.0, + "completions/max_terminated_length": 584.0, + "completions/mean_length": 226.96875, + "completions/mean_terminated_length": 226.96875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.00392489657367137, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.019651910522952676, + "learning_rate": 7.9856e-06, + "loss": -0.0033, + "num_tokens": 1750637.0, + "reward": 3.9629361629486084, + "reward_std": 0.20966489613056183, + "rewards/reward_fn/mean": 3.9629361629486084, + "rewards/reward_fn/std": 0.2096649408340454, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 197.71875, + "completions/mean_terminated_length": 197.71875, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.004030974859446272, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.011529933894053102, + "learning_rate": 7.9852e-06, + "loss": 0.0005, + "num_tokens": 1800452.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 299.71875, + "completions/mean_terminated_length": 299.71875, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.0041370531452211735, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9296875, + "kl": 0.01613133493810892, + "learning_rate": 7.9848e-06, + "loss": 0.0467, + "num_tokens": 1848923.0, + "reward": 3.5235185623168945, + "reward_std": 0.5581008195877075, + "rewards/reward_fn/mean": 3.5235185623168945, + "rewards/reward_fn/std": 0.5581007599830627, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 186.75, + "completions/mean_terminated_length": 186.75, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.0042431314309960754, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09716796875, + "kl": 0.01886003592517227, + "learning_rate": 7.9844e-06, + "loss": 0.0008, + "num_tokens": 1891123.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 200.5, + "completions/mean_terminated_length": 200.5, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.004349209716770977, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09033203125, + "kl": 0.018136992235668004, + "learning_rate": 7.984e-06, + "loss": 0.0007, + "num_tokens": 1931971.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1789.0, + "completions/mean_length": 967.84375, + "completions/mean_terminated_length": 856.1034545898438, + "completions/min_length": 413.0, + "completions/min_terminated_length": 413.0, + "epoch": 0.004455288002545879, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.15625, + "kl": 0.010472592199221253, + "learning_rate": 7.9836e-06, + "loss": 0.2691, + "num_tokens": 1998782.0, + "reward": 1.5388916730880737, + "reward_std": 0.5033364295959473, + "rewards/reward_fn/mean": 1.5388916730880737, + "rewards/reward_fn/std": 0.5033363699913025, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 688.0, + "completions/mean_length": 342.9375, + "completions/mean_terminated_length": 287.93548583984375, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.00456136628832078, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.013179327361285686, + "learning_rate": 7.9832e-06, + "loss": 0.1886, + "num_tokens": 2051996.0, + "reward": 3.7020716667175293, + "reward_std": 0.7320355772972107, + "rewards/reward_fn/mean": 3.7020716667175293, + "rewards/reward_fn/std": 0.7320355772972107, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/max_terminated_length": 566.0, + "completions/mean_length": 215.46875, + "completions/mean_terminated_length": 215.46875, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.004667444574095682, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.020010275882668793, + "learning_rate": 7.9828e-06, + "loss": 0.0128, + "num_tokens": 2097803.0, + "reward": 2.742847204208374, + "reward_std": 0.29240480065345764, + "rewards/reward_fn/mean": 2.742847204208374, + "rewards/reward_fn/std": 0.2924048602581024, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 160.34375, + "completions/mean_terminated_length": 160.34375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.004773522859870584, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.359375, + "kl": 0.014989322167821229, + "learning_rate": 7.9824e-06, + "loss": 0.1524, + "num_tokens": 2121654.0, + "reward": 3.8570432662963867, + "reward_std": 0.3882334530353546, + "rewards/reward_fn/mean": 3.8570432662963867, + "rewards/reward_fn/std": 0.3882334530353546, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 162.0, + "completions/max_terminated_length": 162.0, + "completions/mean_length": 79.71875, + "completions/mean_terminated_length": 79.71875, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.004879601145645486, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.28515625, + "kl": 0.026190617179963738, + "learning_rate": 7.981999999999999e-06, + "loss": 0.001, + "num_tokens": 2166573.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1247.0, + "completions/max_terminated_length": 1247.0, + "completions/mean_length": 449.5, + "completions/mean_terminated_length": 449.5, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.004985679431420388, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.012275198707357049, + "learning_rate": 7.9816e-06, + "loss": 0.0033, + "num_tokens": 2217437.0, + "reward": 2.856170654296875, + "reward_std": 0.6345119476318359, + "rewards/reward_fn/mean": 2.856170654296875, + "rewards/reward_fn/std": 0.6345118880271912, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 711.0, + "completions/max_terminated_length": 711.0, + "completions/mean_length": 306.9375, + "completions/mean_terminated_length": 306.9375, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.00509175771719529, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.01920297823380679, + "learning_rate": 7.9812e-06, + "loss": -0.0138, + "num_tokens": 2275419.0, + "reward": 3.279310703277588, + "reward_std": 1.086318016052246, + "rewards/reward_fn/mean": 3.279310703277588, + "rewards/reward_fn/std": 1.086318016052246, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1867.0, + "completions/max_terminated_length": 1867.0, + "completions/mean_length": 594.375, + "completions/mean_terminated_length": 594.375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.005197836002970192, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.012267176411114633, + "learning_rate": 7.9808e-06, + "loss": -0.0321, + "num_tokens": 2330535.0, + "reward": 2.3844780921936035, + "reward_std": 0.69173663854599, + "rewards/reward_fn/mean": 2.3844780921936035, + "rewards/reward_fn/std": 0.6917366981506348, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 249.34375, + "completions/mean_terminated_length": 249.34375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.005303914288745094, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.01791852479800582, + "learning_rate": 7.9804e-06, + "loss": 0.0302, + "num_tokens": 2371474.0, + "reward": 3.0003445148468018, + "reward_std": 0.6457379460334778, + "rewards/reward_fn/mean": 3.0003445148468018, + "rewards/reward_fn/std": 0.6457379460334778, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 905.0, + "completions/mean_length": 387.53125, + "completions/mean_terminated_length": 333.9677429199219, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.0054099925745199956, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.013383313198573887, + "learning_rate": 7.98e-06, + "loss": 0.1146, + "num_tokens": 2425219.0, + "reward": 2.647240161895752, + "reward_std": 0.7234665155410767, + "rewards/reward_fn/mean": 2.647240161895752, + "rewards/reward_fn/std": 0.7234665155410767, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 895.0, + "completions/max_terminated_length": 895.0, + "completions/mean_length": 232.125, + "completions/mean_terminated_length": 232.125, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.0055160708602948975, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.022899470990523696, + "learning_rate": 7.979599999999999e-06, + "loss": -0.0405, + "num_tokens": 2461511.0, + "reward": 3.963773488998413, + "reward_std": 0.20492826402187347, + "rewards/reward_fn/mean": 3.963773488998413, + "rewards/reward_fn/std": 0.20492829382419586, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 172.5625, + "completions/mean_terminated_length": 172.5625, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.005622149146069799, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1015625, + "kl": 0.01780761929694563, + "learning_rate": 7.9792e-06, + "loss": 0.0007, + "num_tokens": 2512409.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1693.0, + "completions/max_terminated_length": 1693.0, + "completions/mean_length": 513.375, + "completions/mean_terminated_length": 513.375, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.005728227431844701, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.014814406633377075, + "learning_rate": 7.978799999999999e-06, + "loss": 0.0129, + "num_tokens": 2564133.0, + "reward": 2.456674575805664, + "reward_std": 0.6039776802062988, + "rewards/reward_fn/mean": 2.456674575805664, + "rewards/reward_fn/std": 0.603977620601654, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1566.0, + "completions/mean_length": 824.21875, + "completions/mean_terminated_length": 649.3928833007812, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.005834305717619603, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.01389779313467443, + "learning_rate": 7.9784e-06, + "loss": 0.3627, + "num_tokens": 2627980.0, + "reward": 2.1529626846313477, + "reward_std": 0.9413007497787476, + "rewards/reward_fn/mean": 2.1529626846313477, + "rewards/reward_fn/std": 0.9413006901741028, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1513.0, + "completions/mean_length": 473.125, + "completions/mean_terminated_length": 422.32257080078125, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.005940384003394505, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.013784538023173809, + "learning_rate": 7.977999999999999e-06, + "loss": 0.1598, + "num_tokens": 2673968.0, + "reward": 3.6430978775024414, + "reward_std": 0.9269182682037354, + "rewards/reward_fn/mean": 3.6430978775024414, + "rewards/reward_fn/std": 0.9269182682037354, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 195.0, + "completions/max_terminated_length": 195.0, + "completions/mean_length": 157.90625, + "completions/mean_terminated_length": 157.90625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.006046462289169407, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "kl": 0.013763216906227171, + "learning_rate": 7.9776e-06, + "loss": 0.0307, + "num_tokens": 2732653.0, + "reward": 3.931234836578369, + "reward_std": 0.3889950215816498, + "rewards/reward_fn/mean": 3.931234836578369, + "rewards/reward_fn/std": 0.3889950215816498, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 670.0, + "completions/max_terminated_length": 670.0, + "completions/mean_length": 297.25, + "completions/mean_terminated_length": 297.25, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.006152540574944309, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.011821337277069688, + "learning_rate": 7.977199999999999e-06, + "loss": 0.1431, + "num_tokens": 2788469.0, + "reward": 3.0665931701660156, + "reward_std": 0.08242907375097275, + "rewards/reward_fn/mean": 3.0665931701660156, + "rewards/reward_fn/std": 0.08242906630039215, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.0, + "completions/max_terminated_length": 579.0, + "completions/mean_length": 193.65625, + "completions/mean_terminated_length": 193.65625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.006258618860719211, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.890625, + "kl": 0.021005364251323044, + "learning_rate": 7.9768e-06, + "loss": 0.0154, + "num_tokens": 2829578.0, + "reward": 2.730905532836914, + "reward_std": 0.29996997117996216, + "rewards/reward_fn/mean": 2.730905532836914, + "rewards/reward_fn/std": 0.29996997117996216, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1575.0, + "completions/max_terminated_length": 1575.0, + "completions/mean_length": 298.59375, + "completions/mean_terminated_length": 298.59375, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.006364697146494113, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.018938414519652724, + "learning_rate": 7.9764e-06, + "loss": -0.063, + "num_tokens": 2892893.0, + "reward": 3.3888978958129883, + "reward_std": 0.7617525458335876, + "rewards/reward_fn/mean": 3.3888978958129883, + "rewards/reward_fn/std": 0.7617525458335876, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 816.0, + "completions/max_terminated_length": 816.0, + "completions/mean_length": 214.6875, + "completions/mean_terminated_length": 214.6875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.006470775432269015, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.020086537464521825, + "learning_rate": 7.976e-06, + "loss": -0.0565, + "num_tokens": 2946067.0, + "reward": 3.9270145893096924, + "reward_std": 0.4128677546977997, + "rewards/reward_fn/mean": 3.9270145893096924, + "rewards/reward_fn/std": 0.4128677546977997, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1099.0, + "completions/max_terminated_length": 1099.0, + "completions/mean_length": 351.71875, + "completions/mean_terminated_length": 351.71875, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.0065768537180439166, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.015736156376078725, + "learning_rate": 7.9756e-06, + "loss": 0.0296, + "num_tokens": 2986666.0, + "reward": 2.2112362384796143, + "reward_std": 0.4881065785884857, + "rewards/reward_fn/mean": 2.2112362384796143, + "rewards/reward_fn/std": 0.4881065785884857, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1397.0, + "completions/max_terminated_length": 1397.0, + "completions/mean_length": 321.15625, + "completions/mean_terminated_length": 321.15625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.0066829320038188185, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.020156427985057235, + "learning_rate": 7.9752e-06, + "loss": -0.0779, + "num_tokens": 3028399.0, + "reward": 3.290408134460449, + "reward_std": 0.9479041695594788, + "rewards/reward_fn/mean": 3.290408134460449, + "rewards/reward_fn/std": 0.9479042291641235, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 723.0, + "completions/max_terminated_length": 723.0, + "completions/mean_length": 186.84375, + "completions/mean_terminated_length": 186.84375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.00678901028959372, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12451171875, + "kl": 0.020627434947527945, + "learning_rate": 7.9748e-06, + "loss": 0.0008, + "num_tokens": 3068138.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 849.0, + "completions/max_terminated_length": 849.0, + "completions/mean_length": 366.90625, + "completions/mean_terminated_length": 366.90625, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.006895088575368622, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.016033415216952562, + "learning_rate": 7.9744e-06, + "loss": -0.0287, + "num_tokens": 3117607.0, + "reward": 3.5806355476379395, + "reward_std": 0.6227185130119324, + "rewards/reward_fn/mean": 3.5806355476379395, + "rewards/reward_fn/std": 0.6227185130119324, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1295.0, + "completions/max_terminated_length": 1295.0, + "completions/mean_length": 466.875, + "completions/mean_terminated_length": 466.875, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.007001166861143524, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.014832770335488021, + "learning_rate": 7.974e-06, + "loss": -0.007, + "num_tokens": 3199139.0, + "reward": 3.7774386405944824, + "reward_std": 0.7030869126319885, + "rewards/reward_fn/mean": 3.7774386405944824, + "rewards/reward_fn/std": 0.7030869126319885, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1649.0, + "completions/max_terminated_length": 1649.0, + "completions/mean_length": 424.65625, + "completions/mean_terminated_length": 424.65625, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.007107245146918426, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.014696288155391812, + "learning_rate": 7.9736e-06, + "loss": -0.0192, + "num_tokens": 3249912.0, + "reward": 3.6228115558624268, + "reward_std": 0.7114648818969727, + "rewards/reward_fn/mean": 3.6228115558624268, + "rewards/reward_fn/std": 0.7114648818969727, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1291.0, + "completions/max_terminated_length": 1291.0, + "completions/mean_length": 293.25, + "completions/mean_terminated_length": 293.25, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.007213323432693328, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.02517524897120893, + "learning_rate": 7.9732e-06, + "loss": -0.0745, + "num_tokens": 3296032.0, + "reward": 2.7536492347717285, + "reward_std": 0.27405858039855957, + "rewards/reward_fn/mean": 2.7536492347717285, + "rewards/reward_fn/std": 0.27405858039855957, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 149.15625, + "completions/mean_terminated_length": 149.15625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.00731940171846823, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.158203125, + "kl": 0.02772362041287124, + "learning_rate": 7.9728e-06, + "loss": 0.0011, + "num_tokens": 3344869.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 158.4375, + "completions/mean_terminated_length": 158.4375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.007425480004243132, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1513671875, + "kl": 0.03019587113521993, + "learning_rate": 7.9724e-06, + "loss": 0.0012, + "num_tokens": 3386675.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1326.0, + "completions/max_terminated_length": 1326.0, + "completions/mean_length": 496.65625, + "completions/mean_terminated_length": 496.65625, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "epoch": 0.007531558290018034, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.014372375677339733, + "learning_rate": 7.972e-06, + "loss": -0.0032, + "num_tokens": 3450248.0, + "reward": 3.8883559703826904, + "reward_std": 0.46508049964904785, + "rewards/reward_fn/mean": 3.8883559703826904, + "rewards/reward_fn/std": 0.46508049964904785, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1761.0, + "completions/max_terminated_length": 1761.0, + "completions/mean_length": 621.75, + "completions/mean_terminated_length": 621.75, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.007637636575792935, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.011013327515684068, + "learning_rate": 7.9716e-06, + "loss": -0.0858, + "num_tokens": 3501984.0, + "reward": 2.3477916717529297, + "reward_std": 0.5000441074371338, + "rewards/reward_fn/mean": 2.3477916717529297, + "rewards/reward_fn/std": 0.5000441074371338, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 145.28125, + "completions/mean_terminated_length": 145.28125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.007743714861567837, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10546875, + "kl": 0.015902561601251364, + "learning_rate": 7.9712e-06, + "loss": 0.0006, + "num_tokens": 3547529.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 922.0, + "completions/max_terminated_length": 922.0, + "completions/mean_length": 291.71875, + "completions/mean_terminated_length": 291.71875, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.00784979314734274, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.018224395578727126, + "learning_rate": 7.9708e-06, + "loss": 0.0441, + "num_tokens": 3604480.0, + "reward": 3.9649553298950195, + "reward_std": 0.19824209809303284, + "rewards/reward_fn/mean": 3.9649553298950195, + "rewards/reward_fn/std": 0.19824212789535522, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1167.0, + "completions/max_terminated_length": 1167.0, + "completions/mean_length": 360.15625, + "completions/mean_terminated_length": 360.15625, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.00795587143311764, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.017522861482575536, + "learning_rate": 7.970399999999999e-06, + "loss": -0.0247, + "num_tokens": 3672165.0, + "reward": 3.7407755851745605, + "reward_std": 0.573284924030304, + "rewards/reward_fn/mean": 3.7407755851745605, + "rewards/reward_fn/std": 0.573284924030304, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 99.71875, + "completions/mean_terminated_length": 99.71875, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.008061949718892543, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.25, + "kl": 0.03133802697993815, + "learning_rate": 7.97e-06, + "loss": 0.0013, + "num_tokens": 3725628.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 884.0, + "completions/max_terminated_length": 884.0, + "completions/mean_length": 389.125, + "completions/mean_terminated_length": 389.125, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.008168028004667444, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.014990110415965319, + "learning_rate": 7.969599999999999e-06, + "loss": 0.2218, + "num_tokens": 3776256.0, + "reward": 3.630524158477783, + "reward_std": 0.8723556399345398, + "rewards/reward_fn/mean": 3.630524158477783, + "rewards/reward_fn/std": 0.8723556399345398, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 94.4375, + "completions/mean_terminated_length": 94.4375, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.008274106290442347, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.169921875, + "kl": 0.022003972087986767, + "learning_rate": 7.9692e-06, + "loss": 0.0009, + "num_tokens": 3821742.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1045.0, + "completions/max_terminated_length": 1045.0, + "completions/mean_length": 323.96875, + "completions/mean_terminated_length": 323.96875, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.008380184576217248, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.014573617372661829, + "learning_rate": 7.968799999999999e-06, + "loss": -0.0192, + "num_tokens": 3903693.0, + "reward": 3.9266436100006104, + "reward_std": 0.4149664342403412, + "rewards/reward_fn/mean": 3.9266436100006104, + "rewards/reward_fn/std": 0.414966344833374, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1355.0, + "completions/max_terminated_length": 1355.0, + "completions/mean_length": 343.5, + "completions/mean_terminated_length": 343.5, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.008486262861992151, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.020087406621314585, + "learning_rate": 7.9684e-06, + "loss": -0.0134, + "num_tokens": 3961789.0, + "reward": 2.6687002182006836, + "reward_std": 0.5339718461036682, + "rewards/reward_fn/mean": 2.6687002182006836, + "rewards/reward_fn/std": 0.5339718461036682, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 804.0, + "completions/max_terminated_length": 804.0, + "completions/mean_length": 216.4375, + "completions/mean_terminated_length": 216.4375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.008592341147767052, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.020208495436236262, + "learning_rate": 7.967999999999999e-06, + "loss": 0.0008, + "num_tokens": 3997675.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 260.25, + "completions/mean_terminated_length": 202.5806427001953, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.008698419433541955, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.020875858957879245, + "learning_rate": 7.9676e-06, + "loss": 0.179, + "num_tokens": 4048243.0, + "reward": 3.6548068523406982, + "reward_std": 0.8150468468666077, + "rewards/reward_fn/mean": 3.6548068523406982, + "rewards/reward_fn/std": 0.8150468468666077, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1148.0, + "completions/max_terminated_length": 1148.0, + "completions/mean_length": 388.71875, + "completions/mean_terminated_length": 388.71875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.008804497719316856, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.01721180323511362, + "learning_rate": 7.967199999999999e-06, + "loss": 0.0637, + "num_tokens": 4095434.0, + "reward": 3.598665714263916, + "reward_std": 0.7542276978492737, + "rewards/reward_fn/mean": 3.598665714263916, + "rewards/reward_fn/std": 0.7542277574539185, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 964.0, + "completions/max_terminated_length": 964.0, + "completions/mean_length": 254.9375, + "completions/mean_terminated_length": 254.9375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.008910576005091759, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1298828125, + "kl": 0.02156881894916296, + "learning_rate": 7.9668e-06, + "loss": 0.0009, + "num_tokens": 4131016.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 667.65625, + "completions/mean_terminated_length": 575.6333618164062, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.00901665429086666, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.014661238761618733, + "learning_rate": 7.9664e-06, + "loss": 0.1817, + "num_tokens": 4186685.0, + "reward": 3.415998935699463, + "reward_std": 1.1627610921859741, + "rewards/reward_fn/mean": 3.415998935699463, + "rewards/reward_fn/std": 1.1627610921859741, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 941.0, + "completions/max_terminated_length": 941.0, + "completions/mean_length": 220.0625, + "completions/mean_terminated_length": 220.0625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.00912273257664156, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.109375, + "kl": 0.025990084279328585, + "learning_rate": 7.966e-06, + "loss": 0.001, + "num_tokens": 4221695.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 187.3125, + "completions/mean_terminated_length": 187.3125, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.009228810862416463, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.025184703757986426, + "learning_rate": 7.9656e-06, + "loss": -0.0135, + "num_tokens": 4261897.0, + "reward": 3.9673728942871094, + "reward_std": 0.18456710875034332, + "rewards/reward_fn/mean": 3.9673728942871094, + "rewards/reward_fn/std": 0.18456712365150452, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 575.0, + "completions/max_terminated_length": 575.0, + "completions/mean_length": 194.1875, + "completions/mean_terminated_length": 194.1875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.009334889148191364, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.022489528637379408, + "learning_rate": 7.9652e-06, + "loss": -0.0187, + "num_tokens": 4299119.0, + "reward": 3.961916446685791, + "reward_std": 0.21543395519256592, + "rewards/reward_fn/mean": 3.961916446685791, + "rewards/reward_fn/std": 0.2154339849948883, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1002.0, + "completions/max_terminated_length": 1002.0, + "completions/mean_length": 261.6875, + "completions/mean_terminated_length": 261.6875, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.009440967433966267, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5625, + "kl": 0.022605838836170733, + "learning_rate": 7.9648e-06, + "loss": 0.0996, + "num_tokens": 4336069.0, + "reward": 3.928297281265259, + "reward_std": 0.4056117832660675, + "rewards/reward_fn/mean": 3.928297281265259, + "rewards/reward_fn/std": 0.4056117534637451, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1217.0, + "completions/max_terminated_length": 1217.0, + "completions/mean_length": 338.84375, + "completions/mean_terminated_length": 338.84375, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.009547045719741168, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.01582270860671997, + "learning_rate": 7.9644e-06, + "loss": 0.0102, + "num_tokens": 4382624.0, + "reward": 2.761922597885132, + "reward_std": 0.05517864227294922, + "rewards/reward_fn/mean": 2.761922597885132, + "rewards/reward_fn/std": 0.0551786907017231, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1949.0, + "completions/mean_length": 521.09375, + "completions/mean_terminated_length": 471.83868408203125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.009653124005516071, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.017371521913446486, + "learning_rate": 7.964e-06, + "loss": 0.1239, + "num_tokens": 4444611.0, + "reward": 2.9266998767852783, + "reward_std": 0.650079607963562, + "rewards/reward_fn/mean": 2.9266998767852783, + "rewards/reward_fn/std": 0.6500796675682068, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1358.0, + "completions/mean_length": 721.9375, + "completions/mean_terminated_length": 584.7586059570312, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.009759202291290972, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.014051761128939688, + "learning_rate": 7.9636e-06, + "loss": 0.2817, + "num_tokens": 4517121.0, + "reward": 2.6343460083007812, + "reward_std": 1.013836145401001, + "rewards/reward_fn/mean": 2.6343460083007812, + "rewards/reward_fn/std": 1.013836145401001, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1004.0, + "completions/max_terminated_length": 1004.0, + "completions/mean_length": 215.28125, + "completions/mean_terminated_length": 215.28125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.009865280577065875, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.02322612050920725, + "learning_rate": 7.963199999999999e-06, + "loss": 0.2232, + "num_tokens": 4553546.0, + "reward": 3.925776481628418, + "reward_std": 0.41987186670303345, + "rewards/reward_fn/mean": 3.925776481628418, + "rewards/reward_fn/std": 0.41987186670303345, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1399.0, + "completions/mean_length": 771.09375, + "completions/mean_terminated_length": 639.0, + "completions/min_length": 438.0, + "completions/min_terminated_length": 438.0, + "epoch": 0.009971358862840776, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.013376953662373126, + "learning_rate": 7.9628e-06, + "loss": 0.2941, + "num_tokens": 4608653.0, + "reward": 2.248654365539551, + "reward_std": 0.8817570805549622, + "rewards/reward_fn/mean": 2.248654365539551, + "rewards/reward_fn/std": 0.8817570805549622, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1296.0, + "completions/max_terminated_length": 1296.0, + "completions/mean_length": 284.0625, + "completions/mean_terminated_length": 284.0625, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.010077437148615679, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.023674980737268925, + "learning_rate": 7.962399999999999e-06, + "loss": 0.1735, + "num_tokens": 4647151.0, + "reward": 3.8580751419067383, + "reward_std": 0.5584725737571716, + "rewards/reward_fn/mean": 3.8580751419067383, + "rewards/reward_fn/std": 0.5584725737571716, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1389.5625, + "completions/mean_terminated_length": 1205.199951171875, + "completions/min_length": 511.0, + "completions/min_terminated_length": 511.0, + "epoch": 0.01018351543439058, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6640625, + "kl": 0.007312611152883619, + "learning_rate": 7.962e-06, + "loss": 0.1159, + "num_tokens": 4724609.0, + "reward": 1.842115879058838, + "reward_std": 0.8588997721672058, + "rewards/reward_fn/mean": 1.842115879058838, + "rewards/reward_fn/std": 0.8588997721672058, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.0, + "completions/max_terminated_length": 535.0, + "completions/mean_length": 259.625, + "completions/mean_terminated_length": 259.625, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.010289593720165482, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.024402482667937875, + "learning_rate": 7.9616e-06, + "loss": -0.0928, + "num_tokens": 4767957.0, + "reward": 3.3132870197296143, + "reward_std": 0.9708902835845947, + "rewards/reward_fn/mean": 3.3132870197296143, + "rewards/reward_fn/std": 0.9708903431892395, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 894.5, + "completions/mean_terminated_length": 817.6000366210938, + "completions/min_length": 413.0, + "completions/min_terminated_length": 413.0, + "epoch": 0.010395672005940384, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.91015625, + "kl": 0.00991345732472837, + "learning_rate": 7.9612e-06, + "loss": 0.1967, + "num_tokens": 4841413.0, + "reward": 2.7717971801757812, + "reward_std": 1.1707860231399536, + "rewards/reward_fn/mean": 2.7717971801757812, + "rewards/reward_fn/std": 1.1707861423492432, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1921.0, + "completions/mean_length": 496.0, + "completions/mean_terminated_length": 445.93548583984375, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.010501750291715286, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.016304848017171025, + "learning_rate": 7.9608e-06, + "loss": 0.2749, + "num_tokens": 4894405.0, + "reward": 2.716887950897217, + "reward_std": 0.5000027418136597, + "rewards/reward_fn/mean": 2.716887950897217, + "rewards/reward_fn/std": 0.5000027418136597, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 207.90625, + "completions/mean_terminated_length": 207.90625, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.010607828577490187, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.02409674203954637, + "learning_rate": 7.9604e-06, + "loss": -0.0608, + "num_tokens": 4929506.0, + "reward": 3.7197184562683105, + "reward_std": 0.753462016582489, + "rewards/reward_fn/mean": 3.7197184562683105, + "rewards/reward_fn/std": 0.753462016582489, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1949.0, + "completions/max_terminated_length": 1949.0, + "completions/mean_length": 374.3125, + "completions/mean_terminated_length": 374.3125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.01071390686326509, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.062255859375, + "kl": 0.015362917329184711, + "learning_rate": 7.96e-06, + "loss": 0.0006, + "num_tokens": 4983948.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1179.0, + "completions/max_terminated_length": 1179.0, + "completions/mean_length": 286.6875, + "completions/mean_terminated_length": 286.6875, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.010819985149039991, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.024223619606345892, + "learning_rate": 7.959599999999999e-06, + "loss": -0.098, + "num_tokens": 5024674.0, + "reward": 3.859476089477539, + "reward_std": 0.5529555678367615, + "rewards/reward_fn/mean": 3.859476089477539, + "rewards/reward_fn/std": 0.5529556274414062, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 120.125, + "completions/mean_terminated_length": 120.125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.010926063434814894, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.134765625, + "kl": 0.017856702557764947, + "learning_rate": 7.9592e-06, + "loss": 0.0007, + "num_tokens": 5048390.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1339.0, + "completions/max_terminated_length": 1339.0, + "completions/mean_length": 322.15625, + "completions/mean_terminated_length": 322.15625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.011032141720589795, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.019808009383268654, + "learning_rate": 7.958799999999999e-06, + "loss": 0.0437, + "num_tokens": 5113899.0, + "reward": 2.8031535148620605, + "reward_std": 0.21395450830459595, + "rewards/reward_fn/mean": 2.8031535148620605, + "rewards/reward_fn/std": 0.21395452320575714, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.0, + "completions/max_terminated_length": 707.0, + "completions/mean_length": 196.0, + "completions/mean_terminated_length": 196.0, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.011138220006364698, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "kl": 0.02260214788839221, + "learning_rate": 7.9584e-06, + "loss": -0.1233, + "num_tokens": 5136651.0, + "reward": 3.3392744064331055, + "reward_std": 0.32503390312194824, + "rewards/reward_fn/mean": 3.3392744064331055, + "rewards/reward_fn/std": 0.32503387331962585, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 286.78125, + "completions/mean_terminated_length": 229.9677276611328, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.011244298292139599, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.022181879496201873, + "learning_rate": 7.957999999999999e-06, + "loss": 0.1931, + "num_tokens": 5181188.0, + "reward": 3.0173768997192383, + "reward_std": 0.9120379686355591, + "rewards/reward_fn/mean": 3.0173768997192383, + "rewards/reward_fn/std": 0.9120379686355591, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 236.4375, + "completions/mean_terminated_length": 236.4375, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.011350376577914502, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.01876313250977546, + "learning_rate": 7.9576e-06, + "loss": 0.0321, + "num_tokens": 5225106.0, + "reward": 3.034740924835205, + "reward_std": 0.38066956400871277, + "rewards/reward_fn/mean": 3.034740924835205, + "rewards/reward_fn/std": 0.38066956400871277, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 105.0, + "completions/max_terminated_length": 105.0, + "completions/mean_length": 72.40625, + "completions/mean_terminated_length": 72.40625, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.011456454863689403, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.318359375, + "kl": 0.02579229767434299, + "learning_rate": 7.9572e-06, + "loss": 0.001, + "num_tokens": 5262239.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1813.0, + "completions/mean_length": 1036.21875, + "completions/mean_terminated_length": 891.6785888671875, + "completions/min_length": 449.0, + "completions/min_terminated_length": 449.0, + "epoch": 0.011562533149464305, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.008434178889729083, + "learning_rate": 7.9568e-06, + "loss": 0.215, + "num_tokens": 5321798.0, + "reward": 1.9139066934585571, + "reward_std": 0.8915742635726929, + "rewards/reward_fn/mean": 1.9139066934585571, + "rewards/reward_fn/std": 0.8915743231773376, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 780.0, + "completions/max_terminated_length": 780.0, + "completions/mean_length": 330.4375, + "completions/mean_terminated_length": 330.4375, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.011668611435239206, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07470703125, + "kl": 0.013277154648676515, + "learning_rate": 7.9564e-06, + "loss": 0.0005, + "num_tokens": 5368276.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1607.0, + "completions/mean_length": 427.03125, + "completions/mean_terminated_length": 374.7419128417969, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.01177468972101411, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.01534812489990145, + "learning_rate": 7.956e-06, + "loss": 0.0793, + "num_tokens": 5413205.0, + "reward": 3.095240592956543, + "reward_std": 1.05535888671875, + "rewards/reward_fn/mean": 3.095240592956543, + "rewards/reward_fn/std": 1.05535888671875, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.0, + "completions/max_terminated_length": 673.0, + "completions/mean_length": 293.96875, + "completions/mean_terminated_length": 293.96875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.01188076800678901, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.017015320365317166, + "learning_rate": 7.955599999999999e-06, + "loss": 0.0992, + "num_tokens": 5466292.0, + "reward": 2.8189077377319336, + "reward_std": 0.3456776738166809, + "rewards/reward_fn/mean": 2.8189077377319336, + "rewards/reward_fn/std": 0.3456777334213257, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1115.0, + "completions/mean_length": 502.71875, + "completions/mean_terminated_length": 452.8709411621094, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.011986846292563913, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.012012695078738034, + "learning_rate": 7.9552e-06, + "loss": 0.2114, + "num_tokens": 5520395.0, + "reward": 3.8103599548339844, + "reward_std": 0.5348771214485168, + "rewards/reward_fn/mean": 3.8103599548339844, + "rewards/reward_fn/std": 0.5348771810531616, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 134.46875, + "completions/mean_terminated_length": 134.46875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.012092924578338814, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "kl": 0.018360009300522506, + "learning_rate": 7.954799999999999e-06, + "loss": 0.0659, + "num_tokens": 5561658.0, + "reward": 2.972559690475464, + "reward_std": 0.11098479479551315, + "rewards/reward_fn/mean": 2.972559690475464, + "rewards/reward_fn/std": 0.11098476499319077, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1066.0, + "completions/max_terminated_length": 1066.0, + "completions/mean_length": 480.03125, + "completions/mean_terminated_length": 480.03125, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.012199002864113715, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.01176215277519077, + "learning_rate": 7.9544e-06, + "loss": 0.0591, + "num_tokens": 5620795.0, + "reward": 2.763178825378418, + "reward_std": 0.8988648653030396, + "rewards/reward_fn/mean": 2.763178825378418, + "rewards/reward_fn/std": 0.8988648653030396, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1965.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 537.3125, + "completions/mean_terminated_length": 537.3125, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.012305081149888618, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.011784568894654512, + "learning_rate": 7.953999999999999e-06, + "loss": 0.0149, + "num_tokens": 5675109.0, + "reward": 2.534766674041748, + "reward_std": 0.42342138290405273, + "rewards/reward_fn/mean": 2.534766674041748, + "rewards/reward_fn/std": 0.42342138290405273, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 727.0, + "completions/mean_length": 521.15625, + "completions/mean_terminated_length": 471.9031982421875, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.012411159435663519, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.012846595840528607, + "learning_rate": 7.9536e-06, + "loss": 0.1451, + "num_tokens": 5734314.0, + "reward": 2.8016586303710938, + "reward_std": 0.7866551280021667, + "rewards/reward_fn/mean": 2.8016586303710938, + "rewards/reward_fn/std": 0.786655068397522, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 257.0625, + "completions/mean_terminated_length": 257.0625, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.012517237721438422, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.015249886200763285, + "learning_rate": 7.953199999999999e-06, + "loss": -0.0325, + "num_tokens": 5776108.0, + "reward": 2.9798991680145264, + "reward_std": 0.4480231702327728, + "rewards/reward_fn/mean": 2.9798991680145264, + "rewards/reward_fn/std": 0.44802314043045044, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1529.0, + "completions/max_terminated_length": 1529.0, + "completions/mean_length": 494.28125, + "completions/mean_terminated_length": 494.28125, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.012623316007213323, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.0132115458836779, + "learning_rate": 7.9528e-06, + "loss": 0.1698, + "num_tokens": 5824181.0, + "reward": 2.8119354248046875, + "reward_std": 0.21195653080940247, + "rewards/reward_fn/mean": 2.8119354248046875, + "rewards/reward_fn/std": 0.21195654571056366, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 195.0625, + "completions/mean_terminated_length": 195.0625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.012729394292988225, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0830078125, + "kl": 0.017341266619041562, + "learning_rate": 7.9524e-06, + "loss": 0.0007, + "num_tokens": 5861367.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 641.0, + "completions/mean_length": 320.6875, + "completions/mean_terminated_length": 264.9677429199219, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.012835472578763127, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.0184562795329839, + "learning_rate": 7.952e-06, + "loss": 0.3139, + "num_tokens": 5906125.0, + "reward": 3.835097312927246, + "reward_std": 0.7352915406227112, + "rewards/reward_fn/mean": 3.835097312927246, + "rewards/reward_fn/std": 0.7352915406227112, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 272.3125, + "completions/mean_terminated_length": 272.3125, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.01294155086453803, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06982421875, + "kl": 0.015007280395366251, + "learning_rate": 7.9516e-06, + "loss": 0.0006, + "num_tokens": 5931671.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 192.625, + "completions/mean_terminated_length": 192.625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.01304762915031293, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08544921875, + "kl": 0.017203714582137764, + "learning_rate": 7.9512e-06, + "loss": 0.0007, + "num_tokens": 5973867.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 868.0, + "completions/max_terminated_length": 868.0, + "completions/mean_length": 295.5625, + "completions/mean_terminated_length": 295.5625, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.013153707436087833, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.015582025167532265, + "learning_rate": 7.9508e-06, + "loss": 0.0483, + "num_tokens": 6008669.0, + "reward": 3.4704294204711914, + "reward_std": 0.8120108246803284, + "rewards/reward_fn/mean": 3.4704294204711914, + "rewards/reward_fn/std": 0.8120108246803284, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 878.0, + "completions/max_terminated_length": 878.0, + "completions/mean_length": 371.4375, + "completions/mean_terminated_length": 371.4375, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.013259785721862734, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.01567537139635533, + "learning_rate": 7.9504e-06, + "loss": 0.275, + "num_tokens": 6067755.0, + "reward": 3.2702126502990723, + "reward_std": 0.5275013446807861, + "rewards/reward_fn/mean": 3.2702126502990723, + "rewards/reward_fn/std": 0.5275014042854309, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1262.0, + "completions/max_terminated_length": 1262.0, + "completions/mean_length": 288.4375, + "completions/mean_terminated_length": 288.4375, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.013365864007637637, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.015237941057421267, + "learning_rate": 7.95e-06, + "loss": -0.0294, + "num_tokens": 6137497.0, + "reward": 2.747013568878174, + "reward_std": 0.3507872223854065, + "rewards/reward_fn/mean": 2.747013568878174, + "rewards/reward_fn/std": 0.3507872223854065, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 729.0, + "completions/max_terminated_length": 729.0, + "completions/mean_length": 136.0, + "completions/mean_terminated_length": 136.0, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.013471942293412538, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.095703125, + "kl": 0.017273012548685074, + "learning_rate": 7.9496e-06, + "loss": 0.0007, + "num_tokens": 6171641.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1208.0, + "completions/mean_length": 364.5, + "completions/mean_terminated_length": 310.19354248046875, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.01357802057918744, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.016130903968587518, + "learning_rate": 7.9492e-06, + "loss": 0.2309, + "num_tokens": 6221065.0, + "reward": 3.2581684589385986, + "reward_std": 0.7389606833457947, + "rewards/reward_fn/mean": 3.2581684589385986, + "rewards/reward_fn/std": 0.7389606833457947, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 158.875, + "completions/mean_terminated_length": 158.875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.013684098864962342, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10595703125, + "kl": 0.020700134336948395, + "learning_rate": 7.9488e-06, + "loss": 0.0008, + "num_tokens": 6257413.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 178.09375, + "completions/mean_terminated_length": 178.09375, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.013790177150737245, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.015618887031450868, + "learning_rate": 7.9484e-06, + "loss": 0.044, + "num_tokens": 6303144.0, + "reward": 3.203434944152832, + "reward_std": 0.39366769790649414, + "rewards/reward_fn/mean": 3.203434944152832, + "rewards/reward_fn/std": 0.39366772770881653, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1560.0, + "completions/max_terminated_length": 1560.0, + "completions/mean_length": 206.5625, + "completions/mean_terminated_length": 206.5625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.013896255436512146, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.020754429628141224, + "learning_rate": 7.948e-06, + "loss": 0.0881, + "num_tokens": 6349914.0, + "reward": 3.9251515865325928, + "reward_std": 0.42340630292892456, + "rewards/reward_fn/mean": 3.9251515865325928, + "rewards/reward_fn/std": 0.42340633273124695, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 951.0, + "completions/mean_length": 586.65625, + "completions/mean_terminated_length": 539.51611328125, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.014002333722287048, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.016077990061603487, + "learning_rate": 7.9476e-06, + "loss": 0.1381, + "num_tokens": 6405231.0, + "reward": 2.754066228866577, + "reward_std": 0.8537517189979553, + "rewards/reward_fn/mean": 2.754066228866577, + "rewards/reward_fn/std": 0.8537517189979553, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 824.0, + "completions/max_terminated_length": 824.0, + "completions/mean_length": 368.90625, + "completions/mean_terminated_length": 368.90625, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.01410841200806195, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.021284001879394054, + "learning_rate": 7.947199999999999e-06, + "loss": -0.0086, + "num_tokens": 6432524.0, + "reward": 2.1652965545654297, + "reward_std": 0.8815619945526123, + "rewards/reward_fn/mean": 2.1652965545654297, + "rewards/reward_fn/std": 0.8815619945526123, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 113.1875, + "completions/mean_terminated_length": 113.1875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.014214490293836852, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.75, + "kl": 0.020680560497567058, + "learning_rate": 7.9468e-06, + "loss": 0.0047, + "num_tokens": 6471026.0, + "reward": 3.8417961597442627, + "reward_std": 0.37386056780815125, + "rewards/reward_fn/mean": 3.8417961597442627, + "rewards/reward_fn/std": 0.37386056780815125, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 744.0, + "completions/max_terminated_length": 744.0, + "completions/mean_length": 262.75, + "completions/mean_terminated_length": 262.75, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.014320568579611753, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.023233756190165877, + "learning_rate": 7.946399999999999e-06, + "loss": 0.0785, + "num_tokens": 6510890.0, + "reward": 2.3463985919952393, + "reward_std": 0.588642418384552, + "rewards/reward_fn/mean": 2.3463985919952393, + "rewards/reward_fn/std": 0.5886423587799072, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 567.0, + "completions/max_terminated_length": 567.0, + "completions/mean_length": 202.375, + "completions/mean_terminated_length": 202.375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.014426646865386656, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11474609375, + "kl": 0.02503801044076681, + "learning_rate": 7.946e-06, + "loss": 0.001, + "num_tokens": 6554230.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 234.71875, + "completions/mean_terminated_length": 234.71875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.014532725151161557, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.024327925639227033, + "learning_rate": 7.945599999999999e-06, + "loss": 0.0013, + "num_tokens": 6596909.0, + "reward": 3.922415256500244, + "reward_std": 0.3053688704967499, + "rewards/reward_fn/mean": 3.922415256500244, + "rewards/reward_fn/std": 0.3053688704967499, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.0, + "completions/max_terminated_length": 707.0, + "completions/mean_length": 459.625, + "completions/mean_terminated_length": 459.625, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.01463880343693646, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.018823521910235286, + "learning_rate": 7.9452e-06, + "loss": -0.0468, + "num_tokens": 6642881.0, + "reward": 2.581930637359619, + "reward_std": 0.4790569543838501, + "rewards/reward_fn/mean": 2.581930637359619, + "rewards/reward_fn/std": 0.4790569543838501, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1106.0, + "completions/max_terminated_length": 1106.0, + "completions/mean_length": 382.875, + "completions/mean_terminated_length": 382.875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.01474488172271136, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.018572077504359186, + "learning_rate": 7.944799999999999e-06, + "loss": 0.0251, + "num_tokens": 6690941.0, + "reward": 2.7523913383483887, + "reward_std": 0.3522571325302124, + "rewards/reward_fn/mean": 2.7523913383483887, + "rewards/reward_fn/std": 0.3522571325302124, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 182.21875, + "completions/mean_terminated_length": 182.21875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.014850960008486264, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.65625, + "kl": 0.030267908703535795, + "learning_rate": 7.9444e-06, + "loss": 0.0821, + "num_tokens": 6738724.0, + "reward": 3.928792953491211, + "reward_std": 0.28060102462768555, + "rewards/reward_fn/mean": 3.928792953491211, + "rewards/reward_fn/std": 0.28060105443000793, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 132.21875, + "completions/mean_terminated_length": 132.21875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.014957038294261165, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.515625, + "kl": 0.024672939674928784, + "learning_rate": 7.943999999999999e-06, + "loss": 0.0751, + "num_tokens": 6791531.0, + "reward": 3.088392734527588, + "reward_std": 0.07315707951784134, + "rewards/reward_fn/mean": 3.088392734527588, + "rewards/reward_fn/std": 0.07315707206726074, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1805.0, + "completions/mean_length": 516.71875, + "completions/mean_terminated_length": 467.32257080078125, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.015063116580036067, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.02176033239811659, + "learning_rate": 7.9436e-06, + "loss": 0.1231, + "num_tokens": 6842882.0, + "reward": 2.3468077182769775, + "reward_std": 0.5988110303878784, + "rewards/reward_fn/mean": 2.3468077182769775, + "rewards/reward_fn/std": 0.5988109707832336, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1190.0, + "completions/max_terminated_length": 1190.0, + "completions/mean_length": 387.9375, + "completions/mean_terminated_length": 387.9375, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.015169194865810968, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.019691342720761895, + "learning_rate": 7.9432e-06, + "loss": -0.0128, + "num_tokens": 6889664.0, + "reward": 2.8758623600006104, + "reward_std": 0.4385010898113251, + "rewards/reward_fn/mean": 2.8758623600006104, + "rewards/reward_fn/std": 0.4385010302066803, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 178.21875, + "completions/mean_terminated_length": 178.21875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.01527527315158587, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "kl": 0.02775087859481573, + "learning_rate": 7.9428e-06, + "loss": -0.0002, + "num_tokens": 6924935.0, + "reward": 3.926584482192993, + "reward_std": 0.4153006076812744, + "rewards/reward_fn/mean": 3.926584482192993, + "rewards/reward_fn/std": 0.4153006076812744, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 266.15625, + "completions/mean_terminated_length": 266.15625, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.015381351437360772, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.018933446379378438, + "learning_rate": 7.9424e-06, + "loss": 0.0199, + "num_tokens": 6971724.0, + "reward": 3.3563008308410645, + "reward_std": 0.718804121017456, + "rewards/reward_fn/mean": 3.3563008308410645, + "rewards/reward_fn/std": 0.7188041806221008, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 217.125, + "completions/mean_terminated_length": 217.125, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.015487429723135673, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9375, + "kl": 0.022704745642840862, + "learning_rate": 7.942e-06, + "loss": -0.0131, + "num_tokens": 7016752.0, + "reward": 3.190145492553711, + "reward_std": 0.38979148864746094, + "rewards/reward_fn/mean": 3.190145492553711, + "rewards/reward_fn/std": 0.3897915184497833, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 975.0, + "completions/max_terminated_length": 975.0, + "completions/mean_length": 236.46875, + "completions/mean_terminated_length": 236.46875, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.015593508008910576, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.02000652300193906, + "learning_rate": 7.9416e-06, + "loss": 0.0635, + "num_tokens": 7061183.0, + "reward": 2.778085947036743, + "reward_std": 0.029853839427232742, + "rewards/reward_fn/mean": 2.778085947036743, + "rewards/reward_fn/std": 0.029853837564587593, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 807.0, + "completions/mean_length": 550.25, + "completions/mean_terminated_length": 501.9354553222656, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.01569958629468548, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.01610812882427126, + "learning_rate": 7.9412e-06, + "loss": 0.21, + "num_tokens": 7120551.0, + "reward": 2.2802534103393555, + "reward_std": 0.6225919127464294, + "rewards/reward_fn/mean": 2.2802534103393555, + "rewards/reward_fn/std": 0.6225919127464294, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 143.875, + "completions/mean_terminated_length": 143.875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.015805664580460378, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2138671875, + "kl": 0.03147526946850121, + "learning_rate": 7.9408e-06, + "loss": 0.0013, + "num_tokens": 7158307.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 154.375, + "completions/mean_terminated_length": 154.375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.01591174286623528, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.134765625, + "kl": 0.0251955462154001, + "learning_rate": 7.9404e-06, + "loss": 0.001, + "num_tokens": 7191407.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 681.0, + "completions/max_terminated_length": 681.0, + "completions/mean_length": 313.90625, + "completions/mean_terminated_length": 313.90625, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.016017821152010184, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.022513336036354303, + "learning_rate": 7.94e-06, + "loss": 0.091, + "num_tokens": 7234572.0, + "reward": 3.6232008934020996, + "reward_std": 0.6518265008926392, + "rewards/reward_fn/mean": 3.6232008934020996, + "rewards/reward_fn/std": 0.6518264412879944, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1814.0, + "completions/mean_length": 531.8125, + "completions/mean_terminated_length": 482.9031982421875, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.016123899437785087, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.020188245456665754, + "learning_rate": 7.9396e-06, + "loss": 0.2049, + "num_tokens": 7288454.0, + "reward": 2.797072410583496, + "reward_std": 0.6213882565498352, + "rewards/reward_fn/mean": 2.797072410583496, + "rewards/reward_fn/std": 0.6213882565498352, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 952.0, + "completions/max_terminated_length": 952.0, + "completions/mean_length": 268.0, + "completions/mean_terminated_length": 268.0, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.016229977723559986, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.021411948837339878, + "learning_rate": 7.939199999999998e-06, + "loss": 0.2013, + "num_tokens": 7330694.0, + "reward": 3.8916516304016113, + "reward_std": 0.44769445061683655, + "rewards/reward_fn/mean": 3.8916516304016113, + "rewards/reward_fn/std": 0.44769442081451416, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 202.1875, + "completions/mean_terminated_length": 202.1875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.01633605600933489, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.02316540782339871, + "learning_rate": 7.9388e-06, + "loss": 0.0752, + "num_tokens": 7365740.0, + "reward": 2.775660753250122, + "reward_std": 0.2788248360157013, + "rewards/reward_fn/mean": 2.775660753250122, + "rewards/reward_fn/std": 0.2788248360157013, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 949.0, + "completions/max_terminated_length": 949.0, + "completions/mean_length": 270.6875, + "completions/mean_terminated_length": 270.6875, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.01644213429510979, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.023652152274735272, + "learning_rate": 7.9384e-06, + "loss": 0.1038, + "num_tokens": 7415170.0, + "reward": 3.3111374378204346, + "reward_std": 0.525631308555603, + "rewards/reward_fn/mean": 3.3111374378204346, + "rewards/reward_fn/std": 0.525631308555603, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 357.09375, + "completions/mean_terminated_length": 302.5483703613281, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.016548212580884694, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.01592431077733636, + "learning_rate": 7.938e-06, + "loss": 0.2131, + "num_tokens": 7480837.0, + "reward": 3.85792875289917, + "reward_std": 0.5590457320213318, + "rewards/reward_fn/mean": 3.85792875289917, + "rewards/reward_fn/std": 0.559045672416687, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 130.0625, + "completions/mean_terminated_length": 130.0625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.016654290866659593, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11962890625, + "kl": 0.018957374268211424, + "learning_rate": 7.9376e-06, + "loss": 0.0008, + "num_tokens": 7512743.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 118.5625, + "completions/mean_terminated_length": 118.5625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.016760369152434496, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1455078125, + "kl": 0.02136626502033323, + "learning_rate": 7.9372e-06, + "loss": 0.0009, + "num_tokens": 7547385.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 287.78125, + "completions/mean_terminated_length": 287.78125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.0168664474382094, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.022316091926768422, + "learning_rate": 7.9368e-06, + "loss": 0.0682, + "num_tokens": 7590034.0, + "reward": 2.9934346675872803, + "reward_std": 0.05143573135137558, + "rewards/reward_fn/mean": 2.9934346675872803, + "rewards/reward_fn/std": 0.05143576115369797, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 973.0, + "completions/max_terminated_length": 973.0, + "completions/mean_length": 363.21875, + "completions/mean_terminated_length": 363.21875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.016972525723984302, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.019301145221106708, + "learning_rate": 7.936399999999999e-06, + "loss": 0.0424, + "num_tokens": 7631033.0, + "reward": 2.5955896377563477, + "reward_std": 0.4019843637943268, + "rewards/reward_fn/mean": 2.5955896377563477, + "rewards/reward_fn/std": 0.40198445320129395, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 892.0, + "completions/max_terminated_length": 892.0, + "completions/mean_length": 253.625, + "completions/mean_terminated_length": 253.625, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.0170786040097592, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.296875, + "kl": 0.023140242788940668, + "learning_rate": 7.936e-06, + "loss": -0.2224, + "num_tokens": 7667917.0, + "reward": 1.8273133039474487, + "reward_std": 0.1830357313156128, + "rewards/reward_fn/mean": 1.8273133039474487, + "rewards/reward_fn/std": 0.1830357015132904, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1130.0, + "completions/max_terminated_length": 1130.0, + "completions/mean_length": 290.78125, + "completions/mean_terminated_length": 290.78125, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.017184682295534104, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.018885845551267266, + "learning_rate": 7.935599999999999e-06, + "loss": -0.072, + "num_tokens": 7708902.0, + "reward": 2.6470470428466797, + "reward_std": 0.04726897180080414, + "rewards/reward_fn/mean": 2.6470470428466797, + "rewards/reward_fn/std": 0.04726899042725563, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 121.34375, + "completions/mean_terminated_length": 121.34375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.017290760581309007, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.014050235971808434, + "learning_rate": 7.9352e-06, + "loss": 0.0006, + "num_tokens": 7754129.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 308.71875, + "completions/mean_terminated_length": 308.71875, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.01739683886708391, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.015304000582545996, + "learning_rate": 7.934799999999999e-06, + "loss": -0.0026, + "num_tokens": 7799112.0, + "reward": 3.7717292308807373, + "reward_std": 0.6454416513442993, + "rewards/reward_fn/mean": 3.7717292308807373, + "rewards/reward_fn/std": 0.6454416513442993, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 252.4375, + "completions/mean_terminated_length": 252.4375, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.01750291715285881, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.02483147452585399, + "learning_rate": 7.9344e-06, + "loss": -0.0213, + "num_tokens": 7819094.0, + "reward": 3.130520820617676, + "reward_std": 0.9921421408653259, + "rewards/reward_fn/mean": 3.130520820617676, + "rewards/reward_fn/std": 0.9921420812606812, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 998.0, + "completions/max_terminated_length": 998.0, + "completions/mean_length": 252.5, + "completions/mean_terminated_length": 252.5, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.01760899543863371, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.022684934083372355, + "learning_rate": 7.934e-06, + "loss": 0.1033, + "num_tokens": 7873542.0, + "reward": 2.9395103454589844, + "reward_std": 0.25097665190696716, + "rewards/reward_fn/mean": 2.9395103454589844, + "rewards/reward_fn/std": 0.2509766221046448, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 96.8125, + "completions/mean_terminated_length": 96.8125, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.017715073724408614, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.142578125, + "kl": 0.020881250500679016, + "learning_rate": 7.9336e-06, + "loss": 0.0008, + "num_tokens": 7918592.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 942.0, + "completions/max_terminated_length": 942.0, + "completions/mean_length": 353.75, + "completions/mean_terminated_length": 353.75, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.017821152010183517, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.01776517287362367, + "learning_rate": 7.9332e-06, + "loss": 0.2052, + "num_tokens": 7965880.0, + "reward": 2.8560633659362793, + "reward_std": 0.4875142276287079, + "rewards/reward_fn/mean": 2.8560633659362793, + "rewards/reward_fn/std": 0.4875142276287079, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 744.0, + "completions/max_terminated_length": 744.0, + "completions/mean_length": 256.875, + "completions/mean_terminated_length": 256.875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.017927230295958416, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.022131944191642106, + "learning_rate": 7.9328e-06, + "loss": -0.0649, + "num_tokens": 8018580.0, + "reward": 3.7245075702667236, + "reward_std": 0.8427301645278931, + "rewards/reward_fn/mean": 3.7245075702667236, + "rewards/reward_fn/std": 0.8427301645278931, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 744.0, + "completions/max_terminated_length": 744.0, + "completions/mean_length": 441.9375, + "completions/mean_terminated_length": 441.9375, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.01803330858173332, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.01810378080699593, + "learning_rate": 7.9324e-06, + "loss": -0.0885, + "num_tokens": 8068722.0, + "reward": 2.511117458343506, + "reward_std": 0.7322016954421997, + "rewards/reward_fn/mean": 2.511117458343506, + "rewards/reward_fn/std": 0.7322016954421997, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1183.0, + "completions/max_terminated_length": 1183.0, + "completions/mean_length": 209.9375, + "completions/mean_terminated_length": 209.9375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.018139386867508222, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.484375, + "kl": 0.024077138165012002, + "learning_rate": 7.932e-06, + "loss": -0.0054, + "num_tokens": 8116656.0, + "reward": 3.855273723602295, + "reward_std": 0.48421603441238403, + "rewards/reward_fn/mean": 3.855273723602295, + "rewards/reward_fn/std": 0.48421603441238403, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 135.625, + "completions/mean_terminated_length": 135.625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.01824546515328312, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1083984375, + "kl": 0.025709635578095913, + "learning_rate": 7.9316e-06, + "loss": 0.001, + "num_tokens": 8152516.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 241.5, + "completions/mean_terminated_length": 241.5, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.018351543439058024, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.02147727902047336, + "learning_rate": 7.9312e-06, + "loss": -0.0254, + "num_tokens": 8200052.0, + "reward": 3.6303017139434814, + "reward_std": 0.5629006028175354, + "rewards/reward_fn/mean": 3.6303017139434814, + "rewards/reward_fn/std": 0.5629005432128906, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 215.0625, + "completions/mean_terminated_length": 215.0625, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.018457621724832927, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.015550489188171923, + "learning_rate": 7.930799999999999e-06, + "loss": -0.0056, + "num_tokens": 8245206.0, + "reward": 3.894169330596924, + "reward_std": 0.4359620213508606, + "rewards/reward_fn/mean": 3.894169330596924, + "rewards/reward_fn/std": 0.4359620213508606, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 182.34375, + "completions/mean_terminated_length": 182.34375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.01856370001060783, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.375, + "kl": 0.018534490489400923, + "learning_rate": 7.9304e-06, + "loss": -0.0642, + "num_tokens": 8285089.0, + "reward": 3.3171892166137695, + "reward_std": 0.22977031767368317, + "rewards/reward_fn/mean": 3.3171892166137695, + "rewards/reward_fn/std": 0.2297702431678772, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 158.03125, + "completions/mean_terminated_length": 158.03125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.01866977829638273, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "kl": 0.01954884792212397, + "learning_rate": 7.929999999999999e-06, + "loss": -0.029, + "num_tokens": 8314050.0, + "reward": 3.89382266998291, + "reward_std": 0.33542728424072266, + "rewards/reward_fn/mean": 3.89382266998291, + "rewards/reward_fn/std": 0.33542731404304504, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 82.28125, + "completions/mean_terminated_length": 82.28125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.01877585658215763, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1689453125, + "kl": 0.02278315497096628, + "learning_rate": 7.9296e-06, + "loss": 0.0009, + "num_tokens": 8348715.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.0, + "completions/max_terminated_length": 582.0, + "completions/mean_length": 213.3125, + "completions/mean_terminated_length": 213.3125, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.018881934867932534, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "kl": 0.020352299557998776, + "learning_rate": 7.9292e-06, + "loss": 0.0348, + "num_tokens": 8389077.0, + "reward": 3.964296340942383, + "reward_std": 0.2019711285829544, + "rewards/reward_fn/mean": 3.964296340942383, + "rewards/reward_fn/std": 0.2019711583852768, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 245.90625, + "completions/mean_terminated_length": 245.90625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.018988013153707437, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.01580990757793188, + "learning_rate": 7.9288e-06, + "loss": -0.0487, + "num_tokens": 8435986.0, + "reward": 3.861480712890625, + "reward_std": 0.545066773891449, + "rewards/reward_fn/mean": 3.861480712890625, + "rewards/reward_fn/std": 0.545066773891449, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 145.4375, + "completions/mean_terminated_length": 145.4375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.019094091439482336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10888671875, + "kl": 0.01902392355259508, + "learning_rate": 7.9284e-06, + "loss": 0.0008, + "num_tokens": 8471936.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 184.25, + "completions/mean_terminated_length": 184.25, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.01920016972525724, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.022345099016092718, + "learning_rate": 7.928e-06, + "loss": -0.0118, + "num_tokens": 8517768.0, + "reward": 3.9072141647338867, + "reward_std": 0.2347065657377243, + "rewards/reward_fn/mean": 3.9072141647338867, + "rewards/reward_fn/std": 0.2347065508365631, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 170.59375, + "completions/mean_terminated_length": 170.59375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.019306248011032142, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9375, + "kl": 0.022710592485964298, + "learning_rate": 7.9276e-06, + "loss": -0.0182, + "num_tokens": 8545371.0, + "reward": 3.7901389598846436, + "reward_std": 0.6630701422691345, + "rewards/reward_fn/mean": 3.7901389598846436, + "rewards/reward_fn/std": 0.6630700826644897, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1487.0, + "completions/max_terminated_length": 1487.0, + "completions/mean_length": 465.59375, + "completions/mean_terminated_length": 465.59375, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.019412326296807045, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.017950538313016295, + "learning_rate": 7.9272e-06, + "loss": 0.1813, + "num_tokens": 8598190.0, + "reward": 2.7570290565490723, + "reward_std": 0.33804285526275635, + "rewards/reward_fn/mean": 2.7570290565490723, + "rewards/reward_fn/std": 0.33804285526275635, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 183.375, + "completions/mean_terminated_length": 183.375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.019518404582581944, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.021848752396181226, + "learning_rate": 7.9268e-06, + "loss": -0.0899, + "num_tokens": 8644410.0, + "reward": 3.5412044525146484, + "reward_std": 0.5812621116638184, + "rewards/reward_fn/mean": 3.5412044525146484, + "rewards/reward_fn/std": 0.5812621116638184, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 315.0, + "completions/mean_terminated_length": 315.0, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.019624482868356847, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.017826329451054335, + "learning_rate": 7.9264e-06, + "loss": 0.0526, + "num_tokens": 8683802.0, + "reward": 2.4369404315948486, + "reward_std": 0.47755300998687744, + "rewards/reward_fn/mean": 2.4369404315948486, + "rewards/reward_fn/std": 0.47755295038223267, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1280.0, + "completions/max_terminated_length": 1280.0, + "completions/mean_length": 287.75, + "completions/mean_terminated_length": 287.75, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.01973056115413175, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.015661925077438354, + "learning_rate": 7.926e-06, + "loss": -0.0674, + "num_tokens": 8740626.0, + "reward": 2.8808302879333496, + "reward_std": 0.07162141054868698, + "rewards/reward_fn/mean": 2.8808302879333496, + "rewards/reward_fn/std": 0.07162139564752579, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 237.90625, + "completions/mean_terminated_length": 237.90625, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.019836639439906652, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.390625, + "kl": 0.025629171170294285, + "learning_rate": 7.925599999999999e-06, + "loss": 0.0427, + "num_tokens": 8785615.0, + "reward": 3.9353818893432617, + "reward_std": 0.25444263219833374, + "rewards/reward_fn/mean": 3.9353818893432617, + "rewards/reward_fn/std": 0.25444263219833374, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 217.75, + "completions/mean_terminated_length": 217.75, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.019942717725681552, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.023751557571813464, + "learning_rate": 7.9252e-06, + "loss": 0.0619, + "num_tokens": 8826823.0, + "reward": 3.165792465209961, + "reward_std": 0.08262227475643158, + "rewards/reward_fn/mean": 3.165792465209961, + "rewards/reward_fn/std": 0.08262225985527039, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 974.0, + "completions/max_terminated_length": 974.0, + "completions/mean_length": 357.375, + "completions/mean_terminated_length": 357.375, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.020048796011456455, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.014718591351993382, + "learning_rate": 7.9248e-06, + "loss": -0.0545, + "num_tokens": 8860371.0, + "reward": 3.8873002529144287, + "reward_std": 0.35631558299064636, + "rewards/reward_fn/mean": 3.8873002529144287, + "rewards/reward_fn/std": 0.35631558299064636, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.0, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 393.5625, + "completions/mean_terminated_length": 393.5625, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.020154874297231357, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.01423095993231982, + "learning_rate": 7.9244e-06, + "loss": 0.0451, + "num_tokens": 8908069.0, + "reward": 2.73614239692688, + "reward_std": 0.17838501930236816, + "rewards/reward_fn/mean": 2.73614239692688, + "rewards/reward_fn/std": 0.17838500440120697, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1880.0, + "completions/max_terminated_length": 1880.0, + "completions/mean_length": 583.5625, + "completions/mean_terminated_length": 583.5625, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.02026095258300626, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.014432973344810307, + "learning_rate": 7.924e-06, + "loss": -0.0538, + "num_tokens": 8964215.0, + "reward": 2.671628713607788, + "reward_std": 0.659324586391449, + "rewards/reward_fn/mean": 2.671628713607788, + "rewards/reward_fn/std": 0.659324586391449, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.0, + "completions/max_terminated_length": 614.0, + "completions/mean_length": 149.53125, + "completions/mean_terminated_length": 149.53125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.02036703086878116, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09130859375, + "kl": 0.016666988376528025, + "learning_rate": 7.9236e-06, + "loss": 0.0007, + "num_tokens": 9010280.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 513.0, + "completions/max_terminated_length": 513.0, + "completions/mean_length": 169.75, + "completions/mean_terminated_length": 169.75, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.020473109154556062, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0888671875, + "kl": 0.022206315770745277, + "learning_rate": 7.923199999999999e-06, + "loss": 0.0009, + "num_tokens": 9037280.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 554.0, + "completions/max_terminated_length": 554.0, + "completions/mean_length": 176.96875, + "completions/mean_terminated_length": 176.96875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.020579187440330965, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11279296875, + "kl": 0.023108911118470132, + "learning_rate": 7.9228e-06, + "loss": 0.0009, + "num_tokens": 9063647.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 92.46875, + "completions/mean_terminated_length": 92.46875, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.020685265726105868, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.21875, + "kl": 0.022209690301679075, + "learning_rate": 7.922399999999999e-06, + "loss": 0.0086, + "num_tokens": 9101102.0, + "reward": 3.0705783367156982, + "reward_std": 0.04061302915215492, + "rewards/reward_fn/mean": 3.0705783367156982, + "rewards/reward_fn/std": 0.04061301052570343, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1076.0, + "completions/max_terminated_length": 1076.0, + "completions/mean_length": 302.21875, + "completions/mean_terminated_length": 302.21875, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.020791344011880767, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.019014439545571804, + "learning_rate": 7.922e-06, + "loss": 0.0131, + "num_tokens": 9131189.0, + "reward": 3.8880209922790527, + "reward_std": 0.35388144850730896, + "rewards/reward_fn/mean": 3.8880209922790527, + "rewards/reward_fn/std": 0.3538813889026642, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 748.0, + "completions/max_terminated_length": 748.0, + "completions/mean_length": 282.78125, + "completions/mean_terminated_length": 282.78125, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.02089742229765567, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.01936782174743712, + "learning_rate": 7.921599999999999e-06, + "loss": 0.0349, + "num_tokens": 9180334.0, + "reward": 2.780134677886963, + "reward_std": 0.2310194969177246, + "rewards/reward_fn/mean": 2.780134677886963, + "rewards/reward_fn/std": 0.231019526720047, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 155.0, + "completions/max_terminated_length": 155.0, + "completions/mean_length": 101.21875, + "completions/mean_terminated_length": 101.21875, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.021003500583430573, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.181640625, + "kl": 0.0338819632306695, + "learning_rate": 7.9212e-06, + "loss": 0.0014, + "num_tokens": 9226933.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.0, + "completions/max_terminated_length": 674.0, + "completions/mean_length": 237.3125, + "completions/mean_terminated_length": 237.3125, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.021109578869205475, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.0204662608448416, + "learning_rate": 7.920799999999999e-06, + "loss": 0.0692, + "num_tokens": 9264927.0, + "reward": 3.256192207336426, + "reward_std": 0.5834986567497253, + "rewards/reward_fn/mean": 3.256192207336426, + "rewards/reward_fn/std": 0.5834985971450806, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 852.0, + "completions/max_terminated_length": 852.0, + "completions/mean_length": 245.125, + "completions/mean_terminated_length": 245.125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.021215657154980375, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9296875, + "kl": 0.021983513375744224, + "learning_rate": 7.9204e-06, + "loss": 0.0059, + "num_tokens": 9310723.0, + "reward": 3.7263994216918945, + "reward_std": 0.7355256080627441, + "rewards/reward_fn/mean": 3.7263994216918945, + "rewards/reward_fn/std": 0.7355256080627441, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1724.0, + "completions/max_terminated_length": 1724.0, + "completions/mean_length": 385.3125, + "completions/mean_terminated_length": 385.3125, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.021321735440755277, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.017621309030801058, + "learning_rate": 7.92e-06, + "loss": -0.0468, + "num_tokens": 9355341.0, + "reward": 3.0219438076019287, + "reward_std": 0.6457244157791138, + "rewards/reward_fn/mean": 3.0219438076019287, + "rewards/reward_fn/std": 0.6457244157791138, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1825.0, + "completions/max_terminated_length": 1825.0, + "completions/mean_length": 460.375, + "completions/mean_terminated_length": 460.375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.02142781372653018, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.017469807295128703, + "learning_rate": 7.9196e-06, + "loss": 0.0556, + "num_tokens": 9411481.0, + "reward": 2.799130439758301, + "reward_std": 0.6681373715400696, + "rewards/reward_fn/mean": 2.799130439758301, + "rewards/reward_fn/std": 0.6681373715400696, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1390.0, + "completions/max_terminated_length": 1390.0, + "completions/mean_length": 471.875, + "completions/mean_terminated_length": 471.875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.02153389201230508, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.014571573003195226, + "learning_rate": 7.9192e-06, + "loss": -0.0191, + "num_tokens": 9467445.0, + "reward": 2.55147647857666, + "reward_std": 0.5883057117462158, + "rewards/reward_fn/mean": 2.55147647857666, + "rewards/reward_fn/std": 0.5883057117462158, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.0, + "completions/max_terminated_length": 577.0, + "completions/mean_length": 377.40625, + "completions/mean_terminated_length": 377.40625, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.021639970298079982, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.020812442991882563, + "learning_rate": 7.9188e-06, + "loss": 0.0307, + "num_tokens": 9511202.0, + "reward": 3.7836437225341797, + "reward_std": 0.42160654067993164, + "rewards/reward_fn/mean": 3.7836437225341797, + "rewards/reward_fn/std": 0.4216066002845764, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1126.0, + "completions/max_terminated_length": 1126.0, + "completions/mean_length": 341.8125, + "completions/mean_terminated_length": 341.8125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.021746048583854885, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.019275061087682843, + "learning_rate": 7.9184e-06, + "loss": 0.009, + "num_tokens": 9543420.0, + "reward": 3.574246644973755, + "reward_std": 0.8010706901550293, + "rewards/reward_fn/mean": 3.574246644973755, + "rewards/reward_fn/std": 0.8010706901550293, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1346.0, + "completions/max_terminated_length": 1346.0, + "completions/mean_length": 359.125, + "completions/mean_terminated_length": 359.125, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.021852126869629788, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.020105792675167322, + "learning_rate": 7.918e-06, + "loss": 0.0532, + "num_tokens": 9602560.0, + "reward": 3.616807460784912, + "reward_std": 0.6793028712272644, + "rewards/reward_fn/mean": 3.616807460784912, + "rewards/reward_fn/std": 0.6793028116226196, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1582.0, + "completions/max_terminated_length": 1582.0, + "completions/mean_length": 326.9375, + "completions/mean_terminated_length": 326.9375, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.021958205155404687, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.0190516859292984, + "learning_rate": 7.9176e-06, + "loss": 0.0137, + "num_tokens": 9643486.0, + "reward": 3.9254260063171387, + "reward_std": 0.4218538701534271, + "rewards/reward_fn/mean": 3.9254260063171387, + "rewards/reward_fn/std": 0.42185384035110474, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1806.0, + "completions/max_terminated_length": 1806.0, + "completions/mean_length": 585.125, + "completions/mean_terminated_length": 585.125, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.02206428344117959, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.013375958427786827, + "learning_rate": 7.9172e-06, + "loss": -0.0082, + "num_tokens": 9693762.0, + "reward": 3.8153269290924072, + "reward_std": 0.6042912602424622, + "rewards/reward_fn/mean": 3.8153269290924072, + "rewards/reward_fn/std": 0.6042913198471069, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 869.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 222.5625, + "completions/mean_terminated_length": 222.5625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.022170361726954493, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.021646051667630672, + "learning_rate": 7.9168e-06, + "loss": -0.1015, + "num_tokens": 9732404.0, + "reward": 3.488537311553955, + "reward_std": 0.6231763958930969, + "rewards/reward_fn/mean": 3.488537311553955, + "rewards/reward_fn/std": 0.6231764554977417, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1840.0, + "completions/mean_length": 450.4375, + "completions/mean_terminated_length": 398.9032287597656, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.022276440012729395, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.01734934072010219, + "learning_rate": 7.9164e-06, + "loss": 0.0922, + "num_tokens": 9783682.0, + "reward": 3.4365906715393066, + "reward_std": 1.0417983531951904, + "rewards/reward_fn/mean": 3.4365906715393066, + "rewards/reward_fn/std": 1.0417983531951904, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 106.09375, + "completions/mean_terminated_length": 106.09375, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.022382518298504295, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1162109375, + "kl": 0.02026362717151642, + "learning_rate": 7.916e-06, + "loss": 0.0008, + "num_tokens": 9824517.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 818.0, + "completions/max_terminated_length": 818.0, + "completions/mean_length": 200.875, + "completions/mean_terminated_length": 200.875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.022488596584279198, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.021493054926395416, + "learning_rate": 7.9156e-06, + "loss": -0.0043, + "num_tokens": 9866497.0, + "reward": 3.8106346130371094, + "reward_std": 0.6165704727172852, + "rewards/reward_fn/mean": 3.8106346130371094, + "rewards/reward_fn/std": 0.6165704131126404, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1193.0, + "completions/max_terminated_length": 1193.0, + "completions/mean_length": 247.1875, + "completions/mean_terminated_length": 247.1875, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.0225946748700541, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.53125, + "kl": 0.022569065680727363, + "learning_rate": 7.9152e-06, + "loss": 0.0482, + "num_tokens": 9910567.0, + "reward": 3.931674003601074, + "reward_std": 0.38650935888290405, + "rewards/reward_fn/mean": 3.931674003601074, + "rewards/reward_fn/std": 0.3865092992782593, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1417.0, + "completions/max_terminated_length": 1417.0, + "completions/mean_length": 516.78125, + "completions/mean_terminated_length": 516.78125, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.022700753155829003, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.015229119802825153, + "learning_rate": 7.9148e-06, + "loss": -0.001, + "num_tokens": 9961536.0, + "reward": 2.6889023780822754, + "reward_std": 0.4305468201637268, + "rewards/reward_fn/mean": 2.6889023780822754, + "rewards/reward_fn/std": 0.4305467903614044, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 631.0, + "completions/max_terminated_length": 631.0, + "completions/mean_length": 286.71875, + "completions/mean_terminated_length": 286.71875, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.022806831441603902, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.019226033822633326, + "learning_rate": 7.9144e-06, + "loss": -0.0184, + "num_tokens": 10009143.0, + "reward": 3.134714126586914, + "reward_std": 0.25494593381881714, + "rewards/reward_fn/mean": 3.134714126586914, + "rewards/reward_fn/std": 0.25494590401649475, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1603.0, + "completions/max_terminated_length": 1603.0, + "completions/mean_length": 340.3125, + "completions/mean_terminated_length": 340.3125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.022912909727378805, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.02126729814335704, + "learning_rate": 7.913999999999999e-06, + "loss": -0.0127, + "num_tokens": 10058369.0, + "reward": 3.2720999717712402, + "reward_std": 0.5431775450706482, + "rewards/reward_fn/mean": 3.2720999717712402, + "rewards/reward_fn/std": 0.543177604675293, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1504.0, + "completions/max_terminated_length": 1504.0, + "completions/mean_length": 413.90625, + "completions/mean_terminated_length": 413.90625, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.023018988013153708, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.016389940166845918, + "learning_rate": 7.9136e-06, + "loss": -0.0359, + "num_tokens": 10107422.0, + "reward": 1.7760989665985107, + "reward_std": 0.02450552210211754, + "rewards/reward_fn/mean": 1.7760989665985107, + "rewards/reward_fn/std": 0.0245054978877306, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 293.40625, + "completions/mean_terminated_length": 293.40625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.02312506629892861, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.023776356363669038, + "learning_rate": 7.913199999999999e-06, + "loss": 0.0182, + "num_tokens": 10158091.0, + "reward": 2.7377381324768066, + "reward_std": 0.02857634611427784, + "rewards/reward_fn/mean": 2.7377381324768066, + "rewards/reward_fn/std": 0.028576355427503586, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1972.0, + "completions/max_terminated_length": 1972.0, + "completions/mean_length": 523.0, + "completions/mean_terminated_length": 523.0, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.02323114458470351, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.021222119219601154, + "learning_rate": 7.9128e-06, + "loss": 0.0815, + "num_tokens": 10225227.0, + "reward": 2.895890951156616, + "reward_std": 1.194340705871582, + "rewards/reward_fn/mean": 2.895890951156616, + "rewards/reward_fn/std": 1.194340705871582, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 921.0, + "completions/max_terminated_length": 921.0, + "completions/mean_length": 268.84375, + "completions/mean_terminated_length": 268.84375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.023337222870478413, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.02598167071118951, + "learning_rate": 7.912399999999999e-06, + "loss": 0.0997, + "num_tokens": 10265094.0, + "reward": 3.1712889671325684, + "reward_std": 0.5284618735313416, + "rewards/reward_fn/mean": 3.1712889671325684, + "rewards/reward_fn/std": 0.5284618735313416, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1131.0, + "completions/max_terminated_length": 1131.0, + "completions/mean_length": 407.21875, + "completions/mean_terminated_length": 407.21875, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.023443301156253316, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.01953859266359359, + "learning_rate": 7.912e-06, + "loss": 0.0176, + "num_tokens": 10315277.0, + "reward": 2.7439894676208496, + "reward_std": 0.05666489899158478, + "rewards/reward_fn/mean": 2.7439894676208496, + "rewards/reward_fn/std": 0.056664880365133286, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 946.0, + "completions/max_terminated_length": 946.0, + "completions/mean_length": 309.15625, + "completions/mean_terminated_length": 309.15625, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.02354937944202822, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.021818128880113363, + "learning_rate": 7.911599999999999e-06, + "loss": 0.0998, + "num_tokens": 10357138.0, + "reward": 3.8890299797058105, + "reward_std": 0.3507172167301178, + "rewards/reward_fn/mean": 3.8890299797058105, + "rewards/reward_fn/std": 0.3507172167301178, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 787.0, + "completions/max_terminated_length": 787.0, + "completions/mean_length": 281.21875, + "completions/mean_terminated_length": 281.21875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.023655457727803118, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0947265625, + "kl": 0.02001171070151031, + "learning_rate": 7.9112e-06, + "loss": 0.0008, + "num_tokens": 10401177.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1162.0, + "completions/max_terminated_length": 1162.0, + "completions/mean_length": 338.4375, + "completions/mean_terminated_length": 338.4375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.02376153601357802, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.01955355703830719, + "learning_rate": 7.910799999999999e-06, + "loss": 0.0121, + "num_tokens": 10446823.0, + "reward": 3.107459306716919, + "reward_std": 0.5370301604270935, + "rewards/reward_fn/mean": 3.107459306716919, + "rewards/reward_fn/std": 0.5370301604270935, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 922.0, + "completions/max_terminated_length": 922.0, + "completions/mean_length": 256.90625, + "completions/mean_terminated_length": 256.90625, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.023867614299352923, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.375, + "kl": 0.02468311577104032, + "learning_rate": 7.9104e-06, + "loss": 0.2088, + "num_tokens": 10490596.0, + "reward": 3.9355549812316895, + "reward_std": 0.25359582901000977, + "rewards/reward_fn/mean": 3.9355549812316895, + "rewards/reward_fn/std": 0.25359582901000977, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1500.0, + "completions/max_terminated_length": 1500.0, + "completions/mean_length": 559.53125, + "completions/mean_terminated_length": 559.53125, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.023973692585127826, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.01560274779330939, + "learning_rate": 7.91e-06, + "loss": 0.1691, + "num_tokens": 10572181.0, + "reward": 2.4835715293884277, + "reward_std": 0.4638085961341858, + "rewards/reward_fn/mean": 2.4835715293884277, + "rewards/reward_fn/std": 0.4638086259365082, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2047.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 488.25, + "completions/mean_terminated_length": 488.25, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.024079770870902725, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.90625, + "kl": 0.021264664246700704, + "learning_rate": 7.9096e-06, + "loss": -0.0982, + "num_tokens": 10619101.0, + "reward": 2.9061059951782227, + "reward_std": 0.22990155220031738, + "rewards/reward_fn/mean": 2.9061059951782227, + "rewards/reward_fn/std": 0.22990158200263977, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1757.0, + "completions/max_terminated_length": 1757.0, + "completions/mean_length": 547.8125, + "completions/mean_terminated_length": 547.8125, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.024185849156677628, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.023016543593257666, + "learning_rate": 7.9092e-06, + "loss": 0.0536, + "num_tokens": 10666135.0, + "reward": 2.741976022720337, + "reward_std": 0.5150967240333557, + "rewards/reward_fn/mean": 2.741976022720337, + "rewards/reward_fn/std": 0.5150967836380005, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 200.71875, + "completions/mean_terminated_length": 200.71875, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.02429192744245253, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1728515625, + "kl": 0.03538187500089407, + "learning_rate": 7.9088e-06, + "loss": 0.0014, + "num_tokens": 10700782.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1188.0, + "completions/max_terminated_length": 1188.0, + "completions/mean_length": 214.90625, + "completions/mean_terminated_length": 214.90625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.02439800572822743, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.28125, + "kl": 0.033329138765111566, + "learning_rate": 7.9084e-06, + "loss": -0.0488, + "num_tokens": 10753611.0, + "reward": 3.9345145225524902, + "reward_std": 0.1547694057226181, + "rewards/reward_fn/mean": 3.9345145225524902, + "rewards/reward_fn/std": 0.1547694057226181, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 139.625, + "completions/mean_terminated_length": 139.625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.024504084014002333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.212890625, + "kl": 0.038340474013239145, + "learning_rate": 7.908e-06, + "loss": 0.0015, + "num_tokens": 10798399.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1771.0, + "completions/max_terminated_length": 1771.0, + "completions/mean_length": 558.59375, + "completions/mean_terminated_length": 558.59375, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.024610162299777236, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.018731832038611174, + "learning_rate": 7.9076e-06, + "loss": 0.0969, + "num_tokens": 10832626.0, + "reward": 2.5503268241882324, + "reward_std": 0.4429418444633484, + "rewards/reward_fn/mean": 2.5503268241882324, + "rewards/reward_fn/std": 0.4429418444633484, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 801.0, + "completions/max_terminated_length": 801.0, + "completions/mean_length": 225.78125, + "completions/mean_terminated_length": 225.78125, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.02471624058555214, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.023959350073710084, + "learning_rate": 7.9072e-06, + "loss": 0.0026, + "num_tokens": 10879819.0, + "reward": 2.9614815711975098, + "reward_std": 0.45102646946907043, + "rewards/reward_fn/mean": 2.9614815711975098, + "rewards/reward_fn/std": 0.4510264992713928, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.0, + "completions/max_terminated_length": 517.0, + "completions/mean_length": 161.75, + "completions/mean_terminated_length": 161.75, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.024822318871327038, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.140625, + "kl": 0.027287997072562575, + "learning_rate": 7.906799999999999e-06, + "loss": 0.0649, + "num_tokens": 10918083.0, + "reward": 3.9702796936035156, + "reward_std": 0.16812357306480408, + "rewards/reward_fn/mean": 3.9702796936035156, + "rewards/reward_fn/std": 0.16812357306480408, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 968.0, + "completions/max_terminated_length": 968.0, + "completions/mean_length": 264.8125, + "completions/mean_terminated_length": 264.8125, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.02492839715710194, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.026635492919012904, + "learning_rate": 7.9064e-06, + "loss": -0.0229, + "num_tokens": 10964541.0, + "reward": 3.613354206085205, + "reward_std": 0.5796034336090088, + "rewards/reward_fn/mean": 3.613354206085205, + "rewards/reward_fn/std": 0.5796034336090088, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1248.0, + "completions/max_terminated_length": 1248.0, + "completions/mean_length": 338.46875, + "completions/mean_terminated_length": 338.46875, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.025034475442876843, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.021990014938637614, + "learning_rate": 7.905999999999999e-06, + "loss": -0.0474, + "num_tokens": 11022316.0, + "reward": 3.9591715335845947, + "reward_std": 0.2309606820344925, + "rewards/reward_fn/mean": 3.9591715335845947, + "rewards/reward_fn/std": 0.2309606820344925, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 599.0, + "completions/max_terminated_length": 599.0, + "completions/mean_length": 188.90625, + "completions/mean_terminated_length": 188.90625, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.025140553728651746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09716796875, + "kl": 0.021487332647666335, + "learning_rate": 7.9056e-06, + "loss": 0.0009, + "num_tokens": 11049097.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 736.0, + "completions/max_terminated_length": 736.0, + "completions/mean_length": 217.96875, + "completions/mean_terminated_length": 217.96875, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.025246632014426645, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1181640625, + "kl": 0.023882502922788262, + "learning_rate": 7.9052e-06, + "loss": 0.001, + "num_tokens": 11109512.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1533.0, + "completions/mean_length": 655.53125, + "completions/mean_terminated_length": 610.6129150390625, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.025352710300201548, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.01844685070682317, + "learning_rate": 7.9048e-06, + "loss": 0.1706, + "num_tokens": 11146073.0, + "reward": 1.8283706903457642, + "reward_std": 0.5581537485122681, + "rewards/reward_fn/mean": 1.8283706903457642, + "rewards/reward_fn/std": 0.5581536889076233, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1412.0, + "completions/mean_length": 987.75, + "completions/mean_terminated_length": 878.0689697265625, + "completions/min_length": 423.0, + "completions/min_terminated_length": 423.0, + "epoch": 0.02545878858597645, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.012791063985787332, + "learning_rate": 7.9044e-06, + "loss": 0.2083, + "num_tokens": 11222033.0, + "reward": 2.3440942764282227, + "reward_std": 0.8046829700469971, + "rewards/reward_fn/mean": 2.3440942764282227, + "rewards/reward_fn/std": 0.8046829700469971, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.0, + "completions/max_terminated_length": 525.0, + "completions/mean_length": 244.0625, + "completions/mean_terminated_length": 244.0625, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.025564866871751354, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.02614483702927828, + "learning_rate": 7.904e-06, + "loss": 0.0013, + "num_tokens": 11268531.0, + "reward": 2.8031463623046875, + "reward_std": 0.05802328139543533, + "rewards/reward_fn/mean": 2.8031463623046875, + "rewards/reward_fn/std": 0.05802330747246742, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 230.8125, + "completions/mean_terminated_length": 230.8125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.025670945157526253, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.016728723421692848, + "learning_rate": 7.9036e-06, + "loss": 0.032, + "num_tokens": 11311533.0, + "reward": 2.816776752471924, + "reward_std": 0.21942509710788727, + "rewards/reward_fn/mean": 2.816776752471924, + "rewards/reward_fn/std": 0.21942508220672607, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 843.0, + "completions/max_terminated_length": 843.0, + "completions/mean_length": 285.65625, + "completions/mean_terminated_length": 285.65625, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.025777023443301156, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.016763758845627308, + "learning_rate": 7.903199999999999e-06, + "loss": 0.0875, + "num_tokens": 11335810.0, + "reward": 3.3429794311523438, + "reward_std": 0.5541702508926392, + "rewards/reward_fn/mean": 3.3429794311523438, + "rewards/reward_fn/std": 0.5541702508926392, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 222.65625, + "completions/mean_terminated_length": 222.65625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.02588310172907606, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.02172936638817191, + "learning_rate": 7.9028e-06, + "loss": -0.0466, + "num_tokens": 11380663.0, + "reward": 3.145770788192749, + "reward_std": 0.5794350504875183, + "rewards/reward_fn/mean": 3.145770788192749, + "rewards/reward_fn/std": 0.5794350504875183, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 131.8125, + "completions/mean_terminated_length": 131.8125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.02598918001485096, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.421875, + "kl": 0.019724910031072795, + "learning_rate": 7.902399999999999e-06, + "loss": 0.0672, + "num_tokens": 11414193.0, + "reward": 3.9744322299957275, + "reward_std": 0.1446334570646286, + "rewards/reward_fn/mean": 3.9744322299957275, + "rewards/reward_fn/std": 0.1446334421634674, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 652.0, + "completions/max_terminated_length": 652.0, + "completions/mean_length": 425.40625, + "completions/mean_terminated_length": 425.40625, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.02609525830062586, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.015981771517544985, + "learning_rate": 7.902e-06, + "loss": -0.0546, + "num_tokens": 11467390.0, + "reward": 2.7777838706970215, + "reward_std": 0.47854653000831604, + "rewards/reward_fn/mean": 2.7777838706970215, + "rewards/reward_fn/std": 0.47854653000831604, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 862.0, + "completions/max_terminated_length": 862.0, + "completions/mean_length": 342.6875, + "completions/mean_terminated_length": 342.6875, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.026201336586400763, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.017373082577250898, + "learning_rate": 7.901599999999999e-06, + "loss": -0.0624, + "num_tokens": 11513652.0, + "reward": 2.9905588626861572, + "reward_std": 0.4717329740524292, + "rewards/reward_fn/mean": 2.9905588626861572, + "rewards/reward_fn/std": 0.4717329442501068, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 108.5625, + "completions/mean_terminated_length": 108.5625, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.026307414872175666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1328125, + "kl": 0.02236648928374052, + "learning_rate": 7.9012e-06, + "loss": 0.0009, + "num_tokens": 11534086.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 533.0, + "completions/max_terminated_length": 533.0, + "completions/mean_length": 269.40625, + "completions/mean_terminated_length": 269.40625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.02641349315795057, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.022781465435400605, + "learning_rate": 7.9008e-06, + "loss": 0.0046, + "num_tokens": 11575219.0, + "reward": 2.9033608436584473, + "reward_std": 0.2065192610025406, + "rewards/reward_fn/mean": 2.9033608436584473, + "rewards/reward_fn/std": 0.2065192610025406, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 749.75, + "completions/mean_terminated_length": 707.8709716796875, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "epoch": 0.02651957144372547, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.011985445278696716, + "learning_rate": 7.9004e-06, + "loss": 0.1933, + "num_tokens": 11636395.0, + "reward": 2.566638469696045, + "reward_std": 0.4702640473842621, + "rewards/reward_fn/mean": 2.566638469696045, + "rewards/reward_fn/std": 0.4702640473842621, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 683.0, + "completions/max_terminated_length": 683.0, + "completions/mean_length": 280.78125, + "completions/mean_terminated_length": 280.78125, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.02662564972950037, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.01918622641824186, + "learning_rate": 7.9e-06, + "loss": 0.1319, + "num_tokens": 11663652.0, + "reward": 3.4576807022094727, + "reward_std": 0.5513618588447571, + "rewards/reward_fn/mean": 3.4576807022094727, + "rewards/reward_fn/std": 0.5513618588447571, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1161.0, + "completions/max_terminated_length": 1161.0, + "completions/mean_length": 303.875, + "completions/mean_terminated_length": 303.875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.026731728015275274, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.02559892018325627, + "learning_rate": 7.8996e-06, + "loss": 0.0889, + "num_tokens": 11714272.0, + "reward": 3.407193660736084, + "reward_std": 0.6705021262168884, + "rewards/reward_fn/mean": 3.407193660736084, + "rewards/reward_fn/std": 0.6705020666122437, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 969.0, + "completions/max_terminated_length": 969.0, + "completions/mean_length": 320.625, + "completions/mean_terminated_length": 320.625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.026837806301050177, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.01899566757492721, + "learning_rate": 7.8992e-06, + "loss": 0.1479, + "num_tokens": 11761556.0, + "reward": 3.74670672416687, + "reward_std": 0.7026734948158264, + "rewards/reward_fn/mean": 3.74670672416687, + "rewards/reward_fn/std": 0.7026734352111816, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 904.0, + "completions/max_terminated_length": 904.0, + "completions/mean_length": 545.0, + "completions/mean_terminated_length": 545.0, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.026943884586825076, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.013679098337888718, + "learning_rate": 7.8988e-06, + "loss": 0.038, + "num_tokens": 11811060.0, + "reward": 3.042778491973877, + "reward_std": 0.6887111067771912, + "rewards/reward_fn/mean": 3.042778491973877, + "rewards/reward_fn/std": 0.6887110471725464, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 793.0, + "completions/max_terminated_length": 793.0, + "completions/mean_length": 228.78125, + "completions/mean_terminated_length": 228.78125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.02704996287259998, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "kl": 0.031039110152050853, + "learning_rate": 7.898399999999999e-06, + "loss": 0.1049, + "num_tokens": 11849101.0, + "reward": 3.9631972312927246, + "reward_std": 0.20818859338760376, + "rewards/reward_fn/mean": 3.9631972312927246, + "rewards/reward_fn/std": 0.20818862318992615, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1227.0, + "completions/max_terminated_length": 1227.0, + "completions/mean_length": 468.09375, + "completions/mean_terminated_length": 468.09375, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.02715604115837488, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.017622511251829565, + "learning_rate": 7.898e-06, + "loss": 0.069, + "num_tokens": 11912592.0, + "reward": 3.587637186050415, + "reward_std": 0.6166492700576782, + "rewards/reward_fn/mean": 3.587637186050415, + "rewards/reward_fn/std": 0.616649329662323, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 769.0, + "completions/max_terminated_length": 769.0, + "completions/mean_length": 191.5625, + "completions/mean_terminated_length": 191.5625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.027262119444149784, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.02548597170971334, + "learning_rate": 7.897599999999999e-06, + "loss": -0.0541, + "num_tokens": 11946626.0, + "reward": 3.064663887023926, + "reward_std": 0.07985293865203857, + "rewards/reward_fn/mean": 3.064663887023926, + "rewards/reward_fn/std": 0.07985293865203857, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 750.0, + "completions/max_terminated_length": 750.0, + "completions/mean_length": 253.1875, + "completions/mean_terminated_length": 253.1875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.027368197729924684, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "kl": 0.021782919066026807, + "learning_rate": 7.8972e-06, + "loss": 0.1768, + "num_tokens": 11987752.0, + "reward": 3.9413747787475586, + "reward_std": 0.23125647008419037, + "rewards/reward_fn/mean": 3.9413747787475586, + "rewards/reward_fn/std": 0.23125645518302917, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.0, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 218.46875, + "completions/mean_terminated_length": 218.46875, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.027474276015699586, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1279296875, + "kl": 0.024505440145730972, + "learning_rate": 7.896799999999999e-06, + "loss": 0.001, + "num_tokens": 12025047.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1152.0, + "completions/max_terminated_length": 1152.0, + "completions/mean_length": 329.96875, + "completions/mean_terminated_length": 329.96875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.02758035430147449, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.064453125, + "kl": 0.018241875804960728, + "learning_rate": 7.8964e-06, + "loss": 0.0007, + "num_tokens": 12068118.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1117.0, + "completions/max_terminated_length": 1117.0, + "completions/mean_length": 265.40625, + "completions/mean_terminated_length": 265.40625, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.02768643258724939, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.02741927863098681, + "learning_rate": 7.896e-06, + "loss": 0.0059, + "num_tokens": 12113347.0, + "reward": 3.966427803039551, + "reward_std": 0.18991301953792572, + "rewards/reward_fn/mean": 3.966427803039551, + "rewards/reward_fn/std": 0.18991298973560333, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 277.5625, + "completions/mean_terminated_length": 277.5625, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.02779251087302429, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "kl": 0.021280562039464712, + "learning_rate": 7.8956e-06, + "loss": 0.0978, + "num_tokens": 12151765.0, + "reward": 2.54994535446167, + "reward_std": 0.4797287583351135, + "rewards/reward_fn/mean": 2.54994535446167, + "rewards/reward_fn/std": 0.4797287583351135, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 826.0, + "completions/max_terminated_length": 826.0, + "completions/mean_length": 382.75, + "completions/mean_terminated_length": 382.75, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.027898589158799194, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.018151523312553763, + "learning_rate": 7.8952e-06, + "loss": 0.0068, + "num_tokens": 12201773.0, + "reward": 3.4161887168884277, + "reward_std": 0.8646740317344666, + "rewards/reward_fn/mean": 3.4161887168884277, + "rewards/reward_fn/std": 0.8646739721298218, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1010.0, + "completions/max_terminated_length": 1010.0, + "completions/mean_length": 499.71875, + "completions/mean_terminated_length": 499.71875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.028004667444574097, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.018706355476751924, + "learning_rate": 7.8948e-06, + "loss": -0.0482, + "num_tokens": 12246948.0, + "reward": 2.617680072784424, + "reward_std": 0.36669453978538513, + "rewards/reward_fn/mean": 2.617680072784424, + "rewards/reward_fn/std": 0.3666945695877075, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 204.09375, + "completions/mean_terminated_length": 204.09375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.028110745730348996, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.037441390566527843, + "learning_rate": 7.8944e-06, + "loss": -0.026, + "num_tokens": 12291847.0, + "reward": 3.009129047393799, + "reward_std": 0.32852211594581604, + "rewards/reward_fn/mean": 3.009129047393799, + "rewards/reward_fn/std": 0.32852208614349365, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 160.5625, + "completions/mean_terminated_length": 160.5625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.0282168240161239, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1328125, + "kl": 0.020008231746032834, + "learning_rate": 7.894e-06, + "loss": 0.0008, + "num_tokens": 12335417.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 214.875, + "completions/mean_terminated_length": 214.875, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.0283229023018988, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.28125, + "kl": 0.023937980644404888, + "learning_rate": 7.8936e-06, + "loss": -0.0182, + "num_tokens": 12374997.0, + "reward": 1.8745217323303223, + "reward_std": 0.3389831781387329, + "rewards/reward_fn/mean": 1.8745217323303223, + "rewards/reward_fn/std": 0.3389831483364105, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 554.0, + "completions/max_terminated_length": 554.0, + "completions/mean_length": 223.4375, + "completions/mean_terminated_length": 223.4375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.028428980587673704, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "kl": 0.021832899190485477, + "learning_rate": 7.8932e-06, + "loss": -0.1111, + "num_tokens": 12416195.0, + "reward": 2.994629144668579, + "reward_std": 0.06882744282484055, + "rewards/reward_fn/mean": 2.994629144668579, + "rewards/reward_fn/std": 0.06882745027542114, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 251.25, + "completions/mean_terminated_length": 251.25, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.028535058873448604, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9921875, + "kl": 0.023315031314268708, + "learning_rate": 7.8928e-06, + "loss": 0.0371, + "num_tokens": 12494667.0, + "reward": 3.4436919689178467, + "reward_std": 0.7342358827590942, + "rewards/reward_fn/mean": 3.4436919689178467, + "rewards/reward_fn/std": 0.7342358231544495, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.0, + "completions/max_terminated_length": 551.0, + "completions/mean_length": 209.03125, + "completions/mean_terminated_length": 209.03125, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.028641137159223506, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.02203134004957974, + "learning_rate": 7.8924e-06, + "loss": 0.0179, + "num_tokens": 12551692.0, + "reward": 3.892535924911499, + "reward_std": 0.4451846778392792, + "rewards/reward_fn/mean": 3.892535924911499, + "rewards/reward_fn/std": 0.4451846778392792, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 119.3125, + "completions/mean_terminated_length": 119.3125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.02874721544499841, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.71875, + "kl": 0.023293037782423198, + "learning_rate": 7.892e-06, + "loss": 0.2132, + "num_tokens": 12590710.0, + "reward": 3.9130430221557617, + "reward_std": 0.2241009622812271, + "rewards/reward_fn/mean": 3.9130430221557617, + "rewards/reward_fn/std": 0.2241009771823883, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 172.78125, + "completions/mean_terminated_length": 172.78125, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.028853293730773312, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.02628507581539452, + "learning_rate": 7.8916e-06, + "loss": 0.0783, + "num_tokens": 12637647.0, + "reward": 3.2433860301971436, + "reward_std": 0.5124220252037048, + "rewards/reward_fn/mean": 3.2433860301971436, + "rewards/reward_fn/std": 0.5124220848083496, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 896.0, + "completions/max_terminated_length": 896.0, + "completions/mean_length": 197.875, + "completions/mean_terminated_length": 197.875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.02895937201654821, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.02240662043914199, + "learning_rate": 7.8912e-06, + "loss": 0.0431, + "num_tokens": 12675627.0, + "reward": 2.7977585792541504, + "reward_std": 0.06745248287916183, + "rewards/reward_fn/mean": 2.7977585792541504, + "rewards/reward_fn/std": 0.06745246052742004, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 841.0, + "completions/max_terminated_length": 841.0, + "completions/mean_length": 300.96875, + "completions/mean_terminated_length": 300.96875, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.029065450302323114, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.020187442656606436, + "learning_rate": 7.890799999999999e-06, + "loss": 0.0213, + "num_tokens": 12713418.0, + "reward": 3.9665956497192383, + "reward_std": 0.1889638453722, + "rewards/reward_fn/mean": 3.9665956497192383, + "rewards/reward_fn/std": 0.1889638453722, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 796.0, + "completions/max_terminated_length": 796.0, + "completions/mean_length": 269.59375, + "completions/mean_terminated_length": 269.59375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.029171528588098017, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.024061910109594464, + "learning_rate": 7.8904e-06, + "loss": -0.1597, + "num_tokens": 12757149.0, + "reward": 2.2087552547454834, + "reward_std": 0.5087428689002991, + "rewards/reward_fn/mean": 2.2087552547454834, + "rewards/reward_fn/std": 0.5087428092956543, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 204.0, + "completions/max_terminated_length": 204.0, + "completions/mean_length": 96.21875, + "completions/mean_terminated_length": 96.21875, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.02927760687387292, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1943359375, + "kl": 0.026183703215792775, + "learning_rate": 7.889999999999999e-06, + "loss": 0.001, + "num_tokens": 12797764.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 176.28125, + "completions/mean_terminated_length": 176.28125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.02938368515964782, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1298828125, + "kl": 0.022986828815191984, + "learning_rate": 7.8896e-06, + "loss": 0.0009, + "num_tokens": 12843533.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 527.0, + "completions/max_terminated_length": 527.0, + "completions/mean_length": 154.15625, + "completions/mean_terminated_length": 154.15625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.02948976344542272, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1328125, + "kl": 0.021190900588408113, + "learning_rate": 7.889199999999999e-06, + "loss": 0.0008, + "num_tokens": 12879122.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 180.46875, + "completions/mean_terminated_length": 180.46875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.029595841731197624, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.453125, + "kl": 0.029538776027038693, + "learning_rate": 7.8888e-06, + "loss": 0.0242, + "num_tokens": 12913409.0, + "reward": 3.92836332321167, + "reward_std": 0.4052387773990631, + "rewards/reward_fn/mean": 3.92836332321167, + "rewards/reward_fn/std": 0.4052387773990631, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 552.0, + "completions/max_terminated_length": 552.0, + "completions/mean_length": 333.78125, + "completions/mean_terminated_length": 333.78125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.029701920016972527, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.021093905437737703, + "learning_rate": 7.888399999999999e-06, + "loss": 0.0428, + "num_tokens": 12975962.0, + "reward": 3.458969831466675, + "reward_std": 0.5869243144989014, + "rewards/reward_fn/mean": 3.458969831466675, + "rewards/reward_fn/std": 0.5869242548942566, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 662.0, + "completions/max_terminated_length": 662.0, + "completions/mean_length": 189.65625, + "completions/mean_terminated_length": 189.65625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.029807998302747427, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.453125, + "kl": 0.028214870719239116, + "learning_rate": 7.888e-06, + "loss": 0.1478, + "num_tokens": 13014543.0, + "reward": 3.90067982673645, + "reward_std": 0.4174049198627472, + "rewards/reward_fn/mean": 3.90067982673645, + "rewards/reward_fn/std": 0.4174049496650696, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 700.0, + "completions/max_terminated_length": 700.0, + "completions/mean_length": 263.96875, + "completions/mean_terminated_length": 263.96875, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.02991407658852233, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.022579851211048663, + "learning_rate": 7.887599999999999e-06, + "loss": -0.0188, + "num_tokens": 13040206.0, + "reward": 3.9292826652526855, + "reward_std": 0.20690298080444336, + "rewards/reward_fn/mean": 3.9292826652526855, + "rewards/reward_fn/std": 0.20690296590328217, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 629.0, + "completions/mean_length": 453.9375, + "completions/mean_terminated_length": 402.51611328125, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.030020154874297232, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.017965498962439597, + "learning_rate": 7.8872e-06, + "loss": 0.1905, + "num_tokens": 13089612.0, + "reward": 3.508497953414917, + "reward_std": 0.9100804924964905, + "rewards/reward_fn/mean": 3.508497953414917, + "rewards/reward_fn/std": 0.9100804924964905, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 771.0, + "completions/max_terminated_length": 771.0, + "completions/mean_length": 403.90625, + "completions/mean_terminated_length": 403.90625, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.030126233160072135, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.01794223056640476, + "learning_rate": 7.8868e-06, + "loss": -0.1122, + "num_tokens": 13136905.0, + "reward": 2.901538372039795, + "reward_std": 0.2880001664161682, + "rewards/reward_fn/mean": 2.901538372039795, + "rewards/reward_fn/std": 0.28800004720687866, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 737.0, + "completions/max_terminated_length": 737.0, + "completions/mean_length": 193.9375, + "completions/mean_terminated_length": 193.9375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.030232311445847034, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.375, + "kl": 0.027612620033323765, + "learning_rate": 7.8864e-06, + "loss": 0.028, + "num_tokens": 13170535.0, + "reward": 3.2102768421173096, + "reward_std": 0.1197659894824028, + "rewards/reward_fn/mean": 3.2102768421173096, + "rewards/reward_fn/std": 0.11976601183414459, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 851.0, + "completions/max_terminated_length": 851.0, + "completions/mean_length": 256.5, + "completions/mean_terminated_length": 256.5, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.030338389731621937, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.023041230160742998, + "learning_rate": 7.886e-06, + "loss": 0.0101, + "num_tokens": 13207031.0, + "reward": 3.574709892272949, + "reward_std": 0.5251328945159912, + "rewards/reward_fn/mean": 3.574709892272949, + "rewards/reward_fn/std": 0.5251328945159912, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 993.0, + "completions/max_terminated_length": 993.0, + "completions/mean_length": 473.625, + "completions/mean_terminated_length": 473.625, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.03044446801739684, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.99609375, + "kl": 0.01666165341157466, + "learning_rate": 7.8856e-06, + "loss": -0.0288, + "num_tokens": 13259595.0, + "reward": 2.8182735443115234, + "reward_std": 0.23360121250152588, + "rewards/reward_fn/mean": 2.8182735443115234, + "rewards/reward_fn/std": 0.23360122740268707, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1065.0, + "completions/max_terminated_length": 1065.0, + "completions/mean_length": 309.34375, + "completions/mean_terminated_length": 309.34375, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.03055054630317174, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.019797870074398816, + "learning_rate": 7.8852e-06, + "loss": -0.0027, + "num_tokens": 13313430.0, + "reward": 3.900583505630493, + "reward_std": 0.4218868315219879, + "rewards/reward_fn/mean": 3.900583505630493, + "rewards/reward_fn/std": 0.4218868315219879, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1104.0, + "completions/max_terminated_length": 1104.0, + "completions/mean_length": 285.25, + "completions/mean_terminated_length": 285.25, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.030656624588946642, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.01816290069837123, + "learning_rate": 7.8848e-06, + "loss": 0.0179, + "num_tokens": 13357246.0, + "reward": 3.9340410232543945, + "reward_std": 0.26044347882270813, + "rewards/reward_fn/mean": 3.9340410232543945, + "rewards/reward_fn/std": 0.26044347882270813, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2018.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 643.9375, + "completions/mean_terminated_length": 643.9375, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.030762702874721545, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.015570356510579586, + "learning_rate": 7.8844e-06, + "loss": 0.0858, + "num_tokens": 13424508.0, + "reward": 2.594058036804199, + "reward_std": 0.24677404761314392, + "rewards/reward_fn/mean": 2.594058036804199, + "rewards/reward_fn/std": 0.24677406251430511, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1715.0, + "completions/mean_length": 535.375, + "completions/mean_terminated_length": 486.58062744140625, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.030868781160496447, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.017655761679634452, + "learning_rate": 7.884e-06, + "loss": 0.1712, + "num_tokens": 13456200.0, + "reward": 3.049564838409424, + "reward_std": 1.0932600498199463, + "rewards/reward_fn/mean": 3.049564838409424, + "rewards/reward_fn/std": 1.0932600498199463, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 682.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 273.4375, + "completions/mean_terminated_length": 273.4375, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.030974859446271347, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.0212192558683455, + "learning_rate": 7.8836e-06, + "loss": -0.0082, + "num_tokens": 13495734.0, + "reward": 2.9622116088867188, + "reward_std": 0.20986686646938324, + "rewards/reward_fn/mean": 2.9622116088867188, + "rewards/reward_fn/std": 0.20986689627170563, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1755.0, + "completions/max_terminated_length": 1755.0, + "completions/mean_length": 581.125, + "completions/mean_terminated_length": 581.125, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.03108093773204625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732421875, + "kl": 0.018238925491459668, + "learning_rate": 7.8832e-06, + "loss": 0.0007, + "num_tokens": 13553274.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 965.0, + "completions/max_terminated_length": 965.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.031187016017821152, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.150390625, + "kl": 0.021185664576478302, + "learning_rate": 7.882799999999998e-06, + "loss": 0.0008, + "num_tokens": 13605840.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 726.0, + "completions/max_terminated_length": 726.0, + "completions/mean_length": 239.71875, + "completions/mean_terminated_length": 239.71875, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.03129309430359605, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.328125, + "kl": 0.027498473413288593, + "learning_rate": 7.8824e-06, + "loss": 0.1091, + "num_tokens": 13650791.0, + "reward": 3.749105215072632, + "reward_std": 0.48261404037475586, + "rewards/reward_fn/mean": 3.749105215072632, + "rewards/reward_fn/std": 0.48261401057243347, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 110.8125, + "completions/mean_terminated_length": 110.8125, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.03139917258937096, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1474609375, + "kl": 0.03405557991936803, + "learning_rate": 7.882e-06, + "loss": 0.0014, + "num_tokens": 13671649.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1089.0, + "completions/max_terminated_length": 1089.0, + "completions/mean_length": 365.84375, + "completions/mean_terminated_length": 365.84375, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.03150525087514586, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10546875, + "kl": 0.021357741905376315, + "learning_rate": 7.8816e-06, + "loss": 0.0009, + "num_tokens": 13722908.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 301.8125, + "completions/mean_terminated_length": 301.8125, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.031611329160920756, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "kl": 0.02022064500488341, + "learning_rate": 7.8812e-06, + "loss": 0.0568, + "num_tokens": 13758518.0, + "reward": 3.889503002166748, + "reward_std": 0.4602510929107666, + "rewards/reward_fn/mean": 3.889503002166748, + "rewards/reward_fn/std": 0.460251122713089, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1902.0, + "completions/mean_length": 824.1875, + "completions/mean_terminated_length": 784.7096557617188, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.03171740744669566, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.90625, + "kl": 0.011437640176154673, + "learning_rate": 7.880799999999999e-06, + "loss": 0.1652, + "num_tokens": 13835068.0, + "reward": 3.2491872310638428, + "reward_std": 0.6871760487556458, + "rewards/reward_fn/mean": 3.2491872310638428, + "rewards/reward_fn/std": 0.6871760487556458, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1280.0, + "completions/max_terminated_length": 1280.0, + "completions/mean_length": 319.59375, + "completions/mean_terminated_length": 319.59375, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.03182348573247056, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.021747147664427757, + "learning_rate": 7.8804e-06, + "loss": 0.1173, + "num_tokens": 13875791.0, + "reward": 2.884753465652466, + "reward_std": 0.29689720273017883, + "rewards/reward_fn/mean": 2.884753465652466, + "rewards/reward_fn/std": 0.2968972325325012, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1372.0, + "completions/mean_length": 842.90625, + "completions/mean_terminated_length": 762.5667114257812, + "completions/min_length": 411.0, + "completions/min_terminated_length": 411.0, + "epoch": 0.03192956401824547, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.94140625, + "kl": 0.011148195248097181, + "learning_rate": 7.879999999999999e-06, + "loss": 0.0603, + "num_tokens": 13952556.0, + "reward": 2.296818256378174, + "reward_std": 0.7491377592086792, + "rewards/reward_fn/mean": 2.296818256378174, + "rewards/reward_fn/std": 0.7491377592086792, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1247.0, + "completions/max_terminated_length": 1247.0, + "completions/mean_length": 287.40625, + "completions/mean_terminated_length": 287.40625, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.03203564230402037, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.026387330144643784, + "learning_rate": 7.8796e-06, + "loss": -0.0635, + "num_tokens": 14014201.0, + "reward": 3.5167012214660645, + "reward_std": 0.7784268856048584, + "rewards/reward_fn/mean": 3.5167012214660645, + "rewards/reward_fn/std": 0.7784268856048584, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 256.3125, + "completions/mean_terminated_length": 256.3125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.03214172058979527, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.01922440528869629, + "learning_rate": 7.879199999999999e-06, + "loss": 0.031, + "num_tokens": 14063331.0, + "reward": 3.7592110633850098, + "reward_std": 0.5460023880004883, + "rewards/reward_fn/mean": 3.7592110633850098, + "rewards/reward_fn/std": 0.5460023880004883, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 286.625, + "completions/mean_terminated_length": 286.625, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.03224779887557017, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1162109375, + "kl": 0.022301261080428958, + "learning_rate": 7.8788e-06, + "loss": 0.0009, + "num_tokens": 14111927.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1119.0, + "completions/mean_length": 565.21875, + "completions/mean_terminated_length": 517.3870849609375, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.03235387716134507, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.01948609808459878, + "learning_rate": 7.878399999999999e-06, + "loss": 0.2713, + "num_tokens": 14167774.0, + "reward": 2.894683837890625, + "reward_std": 0.7454859614372253, + "rewards/reward_fn/mean": 2.894683837890625, + "rewards/reward_fn/std": 0.7454858422279358, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 125.625, + "completions/mean_terminated_length": 125.625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.03245995544711997, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1962890625, + "kl": 0.024616175913251936, + "learning_rate": 7.878e-06, + "loss": 0.001, + "num_tokens": 14184178.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 589.0, + "completions/max_terminated_length": 589.0, + "completions/mean_length": 184.34375, + "completions/mean_terminated_length": 184.34375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.03256603373289488, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.92578125, + "kl": 0.028724384726956487, + "learning_rate": 7.8776e-06, + "loss": -0.0796, + "num_tokens": 14227837.0, + "reward": 3.9284615516662598, + "reward_std": 0.40468308329582214, + "rewards/reward_fn/mean": 3.9284615516662598, + "rewards/reward_fn/std": 0.40468305349349976, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 285.34375, + "completions/mean_terminated_length": 285.34375, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.03267211201866978, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.01536768360529095, + "learning_rate": 7.8772e-06, + "loss": 0.0255, + "num_tokens": 14272968.0, + "reward": 2.920250415802002, + "reward_std": 0.02926819771528244, + "rewards/reward_fn/mean": 2.920250415802002, + "rewards/reward_fn/std": 0.02926819957792759, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 204.5, + "completions/mean_terminated_length": 204.5, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.03277819030444468, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1875, + "kl": 0.025702545884996653, + "learning_rate": 7.8768e-06, + "loss": 0.1966, + "num_tokens": 14313432.0, + "reward": 3.9622316360473633, + "reward_std": 0.2136494368314743, + "rewards/reward_fn/mean": 3.9622316360473633, + "rewards/reward_fn/std": 0.21364940702915192, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1067.0, + "completions/max_terminated_length": 1067.0, + "completions/mean_length": 303.46875, + "completions/mean_terminated_length": 303.46875, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.03288426859021958, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.017463624943047762, + "learning_rate": 7.8764e-06, + "loss": -0.0187, + "num_tokens": 14358759.0, + "reward": 3.6965291500091553, + "reward_std": 0.6346907615661621, + "rewards/reward_fn/mean": 3.6965291500091553, + "rewards/reward_fn/std": 0.6346907615661621, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 826.0, + "completions/max_terminated_length": 826.0, + "completions/mean_length": 323.84375, + "completions/mean_terminated_length": 323.84375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.03299034687599448, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9765625, + "kl": 0.02454505139030516, + "learning_rate": 7.876e-06, + "loss": -0.0422, + "num_tokens": 14401026.0, + "reward": 3.7419800758361816, + "reward_std": 0.49619293212890625, + "rewards/reward_fn/mean": 3.7419800758361816, + "rewards/reward_fn/std": 0.49619296193122864, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 127.0, + "completions/max_terminated_length": 127.0, + "completions/mean_length": 86.21875, + "completions/mean_terminated_length": 86.21875, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.03309642516176939, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.140625, + "kl": 0.02460471703670919, + "learning_rate": 7.8756e-06, + "loss": 0.001, + "num_tokens": 14444553.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.0, + "completions/max_terminated_length": 582.0, + "completions/mean_length": 350.90625, + "completions/mean_terminated_length": 350.90625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.03320250344754429, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.016784140723757446, + "learning_rate": 7.8752e-06, + "loss": -0.0192, + "num_tokens": 14516326.0, + "reward": 2.9048171043395996, + "reward_std": 0.38163191080093384, + "rewards/reward_fn/mean": 2.9048171043395996, + "rewards/reward_fn/std": 0.38163191080093384, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 715.0, + "completions/mean_length": 350.0625, + "completions/mean_terminated_length": 295.2903137207031, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.03330858173331919, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.026702984934672713, + "learning_rate": 7.8748e-06, + "loss": 0.1718, + "num_tokens": 14566088.0, + "reward": 2.980964422225952, + "reward_std": 1.1688123941421509, + "rewards/reward_fn/mean": 2.980964422225952, + "rewards/reward_fn/std": 1.1688123941421509, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2030.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 474.21875, + "completions/mean_terminated_length": 474.21875, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.03341466001909409, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.01943380292505026, + "learning_rate": 7.874399999999999e-06, + "loss": 0.081, + "num_tokens": 14593871.0, + "reward": 3.155921459197998, + "reward_std": 0.9191026091575623, + "rewards/reward_fn/mean": 3.155921459197998, + "rewards/reward_fn/std": 0.9191026091575623, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 819.0, + "completions/max_terminated_length": 819.0, + "completions/mean_length": 190.03125, + "completions/mean_terminated_length": 190.03125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.03352073830486899, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6875, + "kl": 0.024673061445355415, + "learning_rate": 7.874e-06, + "loss": 0.0506, + "num_tokens": 14634384.0, + "reward": 2.570242404937744, + "reward_std": 1.0145124197006226, + "rewards/reward_fn/mean": 2.570242404937744, + "rewards/reward_fn/std": 1.014512300491333, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1258.0, + "completions/max_terminated_length": 1258.0, + "completions/mean_length": 525.1875, + "completions/mean_terminated_length": 525.1875, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.03362681659064389, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.02036218182183802, + "learning_rate": 7.873599999999999e-06, + "loss": -0.0318, + "num_tokens": 14686390.0, + "reward": 3.0451087951660156, + "reward_std": 0.3707602620124817, + "rewards/reward_fn/mean": 3.0451087951660156, + "rewards/reward_fn/std": 0.3707602024078369, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 902.0, + "completions/mean_length": 466.53125, + "completions/mean_terminated_length": 415.51611328125, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.0337328948764188, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.014624205301515758, + "learning_rate": 7.8732e-06, + "loss": 0.1828, + "num_tokens": 14732903.0, + "reward": 3.0857720375061035, + "reward_std": 0.7979248762130737, + "rewards/reward_fn/mean": 3.0857720375061035, + "rewards/reward_fn/std": 0.797924816608429, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1788.0, + "completions/max_terminated_length": 1788.0, + "completions/mean_length": 344.84375, + "completions/mean_terminated_length": 344.84375, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.0338389731621937, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.02286008303053677, + "learning_rate": 7.8728e-06, + "loss": 0.0713, + "num_tokens": 14776674.0, + "reward": 2.7754576206207275, + "reward_std": 0.20874054729938507, + "rewards/reward_fn/mean": 2.7754576206207275, + "rewards/reward_fn/std": 0.20874051749706268, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1612.0, + "completions/max_terminated_length": 1612.0, + "completions/mean_length": 347.28125, + "completions/mean_terminated_length": 347.28125, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.033945051447968604, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.01978490618057549, + "learning_rate": 7.8724e-06, + "loss": -0.0013, + "num_tokens": 14825035.0, + "reward": 2.7217154502868652, + "reward_std": 0.0514136478304863, + "rewards/reward_fn/mean": 2.7217154502868652, + "rewards/reward_fn/std": 0.05141367390751839, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1224.0, + "completions/max_terminated_length": 1224.0, + "completions/mean_length": 319.34375, + "completions/mean_terminated_length": 319.34375, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.0340511297337435, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.01841491786763072, + "learning_rate": 7.872e-06, + "loss": 0.067, + "num_tokens": 14884118.0, + "reward": 3.8295905590057373, + "reward_std": 0.5808995962142944, + "rewards/reward_fn/mean": 3.8295905590057373, + "rewards/reward_fn/std": 0.5808995962142944, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1452.0, + "completions/max_terminated_length": 1452.0, + "completions/mean_length": 412.78125, + "completions/mean_terminated_length": 412.78125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.0341572080195184, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.015279840677976608, + "learning_rate": 7.8716e-06, + "loss": 0.0439, + "num_tokens": 14942063.0, + "reward": 3.450852870941162, + "reward_std": 0.6985296607017517, + "rewards/reward_fn/mean": 3.450852870941162, + "rewards/reward_fn/std": 0.6985296607017517, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 123.1875, + "completions/mean_terminated_length": 123.1875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.03426328630529331, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6875, + "kl": 0.01893604954238981, + "learning_rate": 7.8712e-06, + "loss": 0.0116, + "num_tokens": 14967285.0, + "reward": 3.701681613922119, + "reward_std": 0.4854101240634918, + "rewards/reward_fn/mean": 3.701681613922119, + "rewards/reward_fn/std": 0.48541009426116943, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 294.5, + "completions/mean_terminated_length": 294.5, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.03436936459106821, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.02002789406105876, + "learning_rate": 7.8708e-06, + "loss": -0.0665, + "num_tokens": 15018021.0, + "reward": 3.452935218811035, + "reward_std": 0.6924206018447876, + "rewards/reward_fn/mean": 3.452935218811035, + "rewards/reward_fn/std": 0.6924206018447876, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1430.0, + "completions/max_terminated_length": 1430.0, + "completions/mean_length": 431.09375, + "completions/mean_terminated_length": 431.09375, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.03447544287684311, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.022256616270169616, + "learning_rate": 7.8704e-06, + "loss": 0.141, + "num_tokens": 15068488.0, + "reward": 3.595818519592285, + "reward_std": 0.6756666898727417, + "rewards/reward_fn/mean": 3.595818519592285, + "rewards/reward_fn/std": 0.6756666898727417, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 886.0, + "completions/max_terminated_length": 886.0, + "completions/mean_length": 260.5625, + "completions/mean_terminated_length": 260.5625, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.03458152116261801, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.01728574268054217, + "learning_rate": 7.87e-06, + "loss": 0.0252, + "num_tokens": 15110010.0, + "reward": 3.9640228748321533, + "reward_std": 0.2035173624753952, + "rewards/reward_fn/mean": 3.9640228748321533, + "rewards/reward_fn/std": 0.2035173922777176, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 201.53125, + "completions/mean_terminated_length": 201.53125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.03468759944839291, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.019541570218279958, + "learning_rate": 7.8696e-06, + "loss": 0.0094, + "num_tokens": 15161835.0, + "reward": 2.861398220062256, + "reward_std": 0.22285261750221252, + "rewards/reward_fn/mean": 2.861398220062256, + "rewards/reward_fn/std": 0.22285260260105133, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 171.0, + "completions/max_terminated_length": 171.0, + "completions/mean_length": 106.59375, + "completions/mean_terminated_length": 106.59375, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.03479367773416782, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.25, + "kl": 0.020829411456361413, + "learning_rate": 7.869199999999999e-06, + "loss": 0.0216, + "num_tokens": 15199870.0, + "reward": 3.7141122817993164, + "reward_std": 0.46484532952308655, + "rewards/reward_fn/mean": 3.7141122817993164, + "rewards/reward_fn/std": 0.46484535932540894, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 348.71875, + "completions/mean_terminated_length": 348.71875, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.03489975601994272, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.021267442498356104, + "learning_rate": 7.8688e-06, + "loss": 0.0477, + "num_tokens": 15240917.0, + "reward": 2.838244915008545, + "reward_std": 0.0607428215444088, + "rewards/reward_fn/mean": 2.838244915008545, + "rewards/reward_fn/std": 0.060742802917957306, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 897.0, + "completions/max_terminated_length": 897.0, + "completions/mean_length": 261.0, + "completions/mean_terminated_length": 261.0, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.03500583430571762, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.026442324509844184, + "learning_rate": 7.8684e-06, + "loss": -0.1112, + "num_tokens": 15287893.0, + "reward": 3.518838405609131, + "reward_std": 0.8920819759368896, + "rewards/reward_fn/mean": 3.518838405609131, + "rewards/reward_fn/std": 0.8920818567276001, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 199.4375, + "completions/mean_terminated_length": 199.4375, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.035111912591492524, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.027465834515169263, + "learning_rate": 7.868e-06, + "loss": 0.0479, + "num_tokens": 15326755.0, + "reward": 3.804008722305298, + "reward_std": 0.3705672025680542, + "rewards/reward_fn/mean": 3.804008722305298, + "rewards/reward_fn/std": 0.3705671727657318, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1112.0, + "completions/max_terminated_length": 1112.0, + "completions/mean_length": 330.84375, + "completions/mean_terminated_length": 330.84375, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.03521799087726742, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.0256270794197917, + "learning_rate": 7.8676e-06, + "loss": -0.0112, + "num_tokens": 15370622.0, + "reward": 2.7658400535583496, + "reward_std": 0.19479042291641235, + "rewards/reward_fn/mean": 2.7658400535583496, + "rewards/reward_fn/std": 0.19479040801525116, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1153.0, + "completions/mean_length": 780.15625, + "completions/mean_terminated_length": 599.0357666015625, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.03532406916304232, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.01505154138430953, + "learning_rate": 7.8672e-06, + "loss": 0.3167, + "num_tokens": 15439171.0, + "reward": 2.6815433502197266, + "reward_std": 1.2791556119918823, + "rewards/reward_fn/mean": 2.6815433502197266, + "rewards/reward_fn/std": 1.2791556119918823, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1081.0, + "completions/max_terminated_length": 1081.0, + "completions/mean_length": 340.4375, + "completions/mean_terminated_length": 340.4375, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.03543014744881723, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.022124167764559388, + "learning_rate": 7.866799999999999e-06, + "loss": -0.0204, + "num_tokens": 15488241.0, + "reward": 3.7875170707702637, + "reward_std": 0.6110662221908569, + "rewards/reward_fn/mean": 3.7875170707702637, + "rewards/reward_fn/std": 0.6110662221908569, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 251.84375, + "completions/mean_terminated_length": 251.84375, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.03553622573459213, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.023350659990683198, + "learning_rate": 7.8664e-06, + "loss": 0.013, + "num_tokens": 15539180.0, + "reward": 2.956249475479126, + "reward_std": 0.4766891896724701, + "rewards/reward_fn/mean": 2.956249475479126, + "rewards/reward_fn/std": 0.4766892194747925, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1524.0, + "completions/mean_length": 694.25, + "completions/mean_terminated_length": 650.5806274414062, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "epoch": 0.035642304020367034, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.02155219833366573, + "learning_rate": 7.865999999999999e-06, + "loss": 0.1593, + "num_tokens": 15606964.0, + "reward": 2.4331278800964355, + "reward_std": 0.6054124236106873, + "rewards/reward_fn/mean": 2.4331278800964355, + "rewards/reward_fn/std": 0.6054123640060425, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1832.0, + "completions/max_terminated_length": 1832.0, + "completions/mean_length": 321.46875, + "completions/mean_terminated_length": 321.46875, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.03574838230614193, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.03199986438266933, + "learning_rate": 7.8656e-06, + "loss": 0.0461, + "num_tokens": 15647139.0, + "reward": 2.767442226409912, + "reward_std": 0.20596739649772644, + "rewards/reward_fn/mean": 2.767442226409912, + "rewards/reward_fn/std": 0.20596742630004883, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 195.0, + "completions/max_terminated_length": 195.0, + "completions/mean_length": 119.90625, + "completions/mean_terminated_length": 119.90625, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.03585446059191683, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.921875, + "kl": 0.03643982787616551, + "learning_rate": 7.865199999999999e-06, + "loss": -0.0177, + "num_tokens": 15696160.0, + "reward": 3.380324602127075, + "reward_std": 0.5965169072151184, + "rewards/reward_fn/mean": 3.380324602127075, + "rewards/reward_fn/std": 0.5965169072151184, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 903.0, + "completions/max_terminated_length": 903.0, + "completions/mean_length": 206.71875, + "completions/mean_terminated_length": 206.71875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.03596053887769174, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16015625, + "kl": 0.030227781971916556, + "learning_rate": 7.8648e-06, + "loss": 0.0012, + "num_tokens": 15742903.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 330.0625, + "completions/mean_terminated_length": 330.0625, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.03606661716346664, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.023755632108077407, + "learning_rate": 7.864399999999999e-06, + "loss": 0.0689, + "num_tokens": 15790329.0, + "reward": 2.75354266166687, + "reward_std": 0.03267281502485275, + "rewards/reward_fn/mean": 2.75354266166687, + "rewards/reward_fn/std": 0.03267282247543335, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1126.0, + "completions/max_terminated_length": 1126.0, + "completions/mean_length": 263.46875, + "completions/mean_terminated_length": 263.46875, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.03617269544924154, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.0320469920989126, + "learning_rate": 7.864e-06, + "loss": -0.1167, + "num_tokens": 15817384.0, + "reward": 3.7375216484069824, + "reward_std": 0.8701768517494202, + "rewards/reward_fn/mean": 3.7375216484069824, + "rewards/reward_fn/std": 0.8701767921447754, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 158.28125, + "completions/mean_terminated_length": 158.28125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.036278773735016444, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.625, + "kl": 0.03848015540279448, + "learning_rate": 7.8636e-06, + "loss": 0.0212, + "num_tokens": 15860625.0, + "reward": 3.2080063819885254, + "reward_std": 0.7717536091804504, + "rewards/reward_fn/mean": 3.2080063819885254, + "rewards/reward_fn/std": 0.7717535495758057, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 620.875, + "completions/mean_terminated_length": 525.7333374023438, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.03638485202079134, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.022421202156692743, + "learning_rate": 7.8632e-06, + "loss": 0.1028, + "num_tokens": 15921357.0, + "reward": 2.6614723205566406, + "reward_std": 0.526772677898407, + "rewards/reward_fn/mean": 2.6614723205566406, + "rewards/reward_fn/std": 0.5267727375030518, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 184.40625, + "completions/mean_terminated_length": 184.40625, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.03649093030656624, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "kl": 0.02825386798940599, + "learning_rate": 7.8628e-06, + "loss": 0.0271, + "num_tokens": 15961850.0, + "reward": 3.163677215576172, + "reward_std": 0.5745911598205566, + "rewards/reward_fn/mean": 3.163677215576172, + "rewards/reward_fn/std": 0.5745911002159119, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 263.0625, + "completions/mean_terminated_length": 263.0625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.03659700859234115, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.03263953677378595, + "learning_rate": 7.8624e-06, + "loss": 0.0094, + "num_tokens": 15981884.0, + "reward": 3.4133460521698, + "reward_std": 0.9804157614707947, + "rewards/reward_fn/mean": 3.4133460521698, + "rewards/reward_fn/std": 0.9804157018661499, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1789.0, + "completions/max_terminated_length": 1789.0, + "completions/mean_length": 376.6875, + "completions/mean_terminated_length": 376.6875, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.03670308687811605, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.028175218030810356, + "learning_rate": 7.862e-06, + "loss": -0.0734, + "num_tokens": 16046098.0, + "reward": 3.028233051300049, + "reward_std": 0.3557929992675781, + "rewards/reward_fn/mean": 3.028233051300049, + "rewards/reward_fn/std": 0.3557929992675781, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 228.1875, + "completions/mean_terminated_length": 228.1875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.036809165163890954, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.03550481074489653, + "learning_rate": 7.8616e-06, + "loss": 0.0374, + "num_tokens": 16086616.0, + "reward": 3.3797616958618164, + "reward_std": 0.6313052177429199, + "rewards/reward_fn/mean": 3.3797616958618164, + "rewards/reward_fn/std": 0.6313052177429199, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.0, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 189.875, + "completions/mean_terminated_length": 189.875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.036915243449665854, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1435546875, + "kl": 0.027385680470615625, + "learning_rate": 7.8612e-06, + "loss": 0.0011, + "num_tokens": 16125396.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 235.5, + "completions/mean_terminated_length": 235.5, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.03702132173544075, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.04070192761719227, + "learning_rate": 7.8608e-06, + "loss": 0.02, + "num_tokens": 16168260.0, + "reward": 3.54952335357666, + "reward_std": 0.7186921834945679, + "rewards/reward_fn/mean": 3.54952335357666, + "rewards/reward_fn/std": 0.7186923027038574, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1830.0, + "completions/max_terminated_length": 1830.0, + "completions/mean_length": 363.90625, + "completions/mean_terminated_length": 363.90625, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.03712740002121566, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.026658386690542102, + "learning_rate": 7.8604e-06, + "loss": -0.0186, + "num_tokens": 16211073.0, + "reward": 3.8557639122009277, + "reward_std": 0.5675714015960693, + "rewards/reward_fn/mean": 3.8557639122009277, + "rewards/reward_fn/std": 0.5675714015960693, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1025.0, + "completions/max_terminated_length": 1025.0, + "completions/mean_length": 291.09375, + "completions/mean_terminated_length": 291.09375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.03723347830699056, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.028336241375654936, + "learning_rate": 7.86e-06, + "loss": -0.0066, + "num_tokens": 16266500.0, + "reward": 2.8183717727661133, + "reward_std": 0.9820890426635742, + "rewards/reward_fn/mean": 2.8183717727661133, + "rewards/reward_fn/std": 0.9820890426635742, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 270.4375, + "completions/mean_terminated_length": 270.4375, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.03733955659276546, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.02873558783903718, + "learning_rate": 7.8596e-06, + "loss": -0.0038, + "num_tokens": 16329586.0, + "reward": 2.79421329498291, + "reward_std": 0.18948742747306824, + "rewards/reward_fn/mean": 2.79421329498291, + "rewards/reward_fn/std": 0.18948739767074585, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1763.0, + "completions/mean_length": 680.0625, + "completions/mean_terminated_length": 588.86669921875, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.037445634878540364, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.02555919042788446, + "learning_rate": 7.8592e-06, + "loss": 0.2532, + "num_tokens": 16373236.0, + "reward": 2.7877817153930664, + "reward_std": 0.9033706784248352, + "rewards/reward_fn/mean": 2.7877817153930664, + "rewards/reward_fn/std": 0.9033706188201904, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 659.0, + "completions/mean_length": 382.59375, + "completions/mean_terminated_length": 328.8709716796875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.03755171316431526, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.024913710309192538, + "learning_rate": 7.8588e-06, + "loss": 0.1477, + "num_tokens": 16427431.0, + "reward": 2.9459004402160645, + "reward_std": 1.0123151540756226, + "rewards/reward_fn/mean": 2.9459004402160645, + "rewards/reward_fn/std": 1.0123151540756226, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 121.125, + "completions/mean_terminated_length": 121.125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.03765779145009017, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.142578125, + "kl": 0.022564243176020682, + "learning_rate": 7.8584e-06, + "loss": 0.0009, + "num_tokens": 16467051.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 629.0, + "completions/max_terminated_length": 629.0, + "completions/mean_length": 232.84375, + "completions/mean_terminated_length": 232.84375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.03776386973586507, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.032447136007249355, + "learning_rate": 7.858e-06, + "loss": -0.007, + "num_tokens": 16514694.0, + "reward": 2.8739380836486816, + "reward_std": 0.4234299659729004, + "rewards/reward_fn/mean": 2.8739380836486816, + "rewards/reward_fn/std": 0.423429936170578, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 184.78125, + "completions/mean_terminated_length": 184.78125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.03786994802163997, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.03548223176039755, + "learning_rate": 7.857599999999999e-06, + "loss": 0.0056, + "num_tokens": 16551295.0, + "reward": 3.7814149856567383, + "reward_std": 0.5462931990623474, + "rewards/reward_fn/mean": 3.7814149856567383, + "rewards/reward_fn/std": 0.5462931990623474, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 622.0, + "completions/max_terminated_length": 622.0, + "completions/mean_length": 199.0, + "completions/mean_terminated_length": 199.0, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.037976026307414874, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1240234375, + "kl": 0.02927296655252576, + "learning_rate": 7.8572e-06, + "loss": 0.0012, + "num_tokens": 16601823.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1658.0, + "completions/mean_length": 681.0, + "completions/mean_terminated_length": 636.9031982421875, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + "epoch": 0.038082104593189774, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.02082135993987322, + "learning_rate": 7.856799999999999e-06, + "loss": 0.054, + "num_tokens": 16649087.0, + "reward": 2.0146098136901855, + "reward_std": 0.59377121925354, + "rewards/reward_fn/mean": 2.0146098136901855, + "rewards/reward_fn/std": 0.59377121925354, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 115.71875, + "completions/mean_terminated_length": 115.71875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.03818818287896467, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11474609375, + "kl": 0.02015709993429482, + "learning_rate": 7.8564e-06, + "loss": 0.0008, + "num_tokens": 16679478.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 764.0, + "completions/max_terminated_length": 764.0, + "completions/mean_length": 360.28125, + "completions/mean_terminated_length": 360.28125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.03829426116473958, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.030813375022262335, + "learning_rate": 7.855999999999999e-06, + "loss": -0.0245, + "num_tokens": 16727167.0, + "reward": 2.8578453063964844, + "reward_std": 0.0672156810760498, + "rewards/reward_fn/mean": 2.8578453063964844, + "rewards/reward_fn/std": 0.0672157034277916, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 116.78125, + "completions/mean_terminated_length": 116.78125, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.03840033945051448, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.201171875, + "kl": 0.03886803472414613, + "learning_rate": 7.8556e-06, + "loss": 0.0016, + "num_tokens": 16768696.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 97.59375, + "completions/mean_terminated_length": 97.59375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.038506417736289385, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1728515625, + "kl": 0.02677657501772046, + "learning_rate": 7.855199999999999e-06, + "loss": 0.0011, + "num_tokens": 16807979.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1010.0, + "completions/max_terminated_length": 1010.0, + "completions/mean_length": 323.0625, + "completions/mean_terminated_length": 323.0625, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.038612496022064284, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.025971536757424474, + "learning_rate": 7.8548e-06, + "loss": 0.0311, + "num_tokens": 16855277.0, + "reward": 3.0506677627563477, + "reward_std": 0.511308491230011, + "rewards/reward_fn/mean": 3.0506677627563477, + "rewards/reward_fn/std": 0.511308491230011, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 850.0, + "completions/max_terminated_length": 850.0, + "completions/mean_length": 440.84375, + "completions/mean_terminated_length": 440.84375, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.03871857430783918, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.028090456500649452, + "learning_rate": 7.854399999999999e-06, + "loss": 0.0004, + "num_tokens": 16930856.0, + "reward": 1.8972702026367188, + "reward_std": 0.502529501914978, + "rewards/reward_fn/mean": 1.8972702026367188, + "rewards/reward_fn/std": 0.502529501914978, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 176.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 120.0, + "completions/mean_terminated_length": 120.0, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.03882465259361409, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.087890625, + "kl": 0.017430690582841635, + "learning_rate": 7.854e-06, + "loss": 0.0007, + "num_tokens": 16972104.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 264.5, + "completions/mean_terminated_length": 264.5, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.03893073087938899, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.02535721193999052, + "learning_rate": 7.8536e-06, + "loss": 0.1011, + "num_tokens": 17014200.0, + "reward": 3.448211193084717, + "reward_std": 0.4737915098667145, + "rewards/reward_fn/mean": 3.448211193084717, + "rewards/reward_fn/std": 0.4737914502620697, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 229.84375, + "completions/mean_terminated_length": 229.84375, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.03903680916516389, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.028732048347592354, + "learning_rate": 7.8532e-06, + "loss": 0.0519, + "num_tokens": 17042675.0, + "reward": 3.8204574584960938, + "reward_std": 0.5905638337135315, + "rewards/reward_fn/mean": 3.8204574584960938, + "rewards/reward_fn/std": 0.5905638337135315, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 144.21875, + "completions/mean_terminated_length": 144.21875, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.039142887450938794, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12255859375, + "kl": 0.02795306663028896, + "learning_rate": 7.8528e-06, + "loss": 0.0011, + "num_tokens": 17072858.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.0, + "completions/max_terminated_length": 614.0, + "completions/mean_length": 291.0, + "completions/mean_terminated_length": 291.0, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.039248965736713694, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.024855082854628563, + "learning_rate": 7.8524e-06, + "loss": 0.039, + "num_tokens": 17111994.0, + "reward": 3.0236971378326416, + "reward_std": 0.18900470435619354, + "rewards/reward_fn/mean": 3.0236971378326416, + "rewards/reward_fn/std": 0.18900467455387115, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 169.6875, + "completions/mean_terminated_length": 169.6875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.0393550440224886, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "kl": 0.036026960471645, + "learning_rate": 7.852e-06, + "loss": -0.0048, + "num_tokens": 17161744.0, + "reward": 3.9573440551757812, + "reward_std": 0.24129843711853027, + "rewards/reward_fn/mean": 3.9573440551757812, + "rewards/reward_fn/std": 0.24129842221736908, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1136.0, + "completions/max_terminated_length": 1136.0, + "completions/mean_length": 207.90625, + "completions/mean_terminated_length": 207.90625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.0394611223082635, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10107421875, + "kl": 0.026199826737865806, + "learning_rate": 7.8516e-06, + "loss": 0.001, + "num_tokens": 17191821.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 195.40625, + "completions/mean_terminated_length": 195.40625, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.0395672005940384, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1630859375, + "kl": 0.03569826763123274, + "learning_rate": 7.8512e-06, + "loss": 0.0014, + "num_tokens": 17239802.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 906.0, + "completions/mean_length": 500.3125, + "completions/mean_terminated_length": 450.3870849609375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.039673278879813305, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.024202445056289434, + "learning_rate": 7.8508e-06, + "loss": 0.0908, + "num_tokens": 17302948.0, + "reward": 2.246391773223877, + "reward_std": 0.7711318135261536, + "rewards/reward_fn/mean": 2.246391773223877, + "rewards/reward_fn/std": 0.7711318135261536, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.0, + "completions/max_terminated_length": 679.0, + "completions/mean_length": 335.375, + "completions/mean_terminated_length": 335.375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.039779357165588204, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.0259898176882416, + "learning_rate": 7.850399999999999e-06, + "loss": 0.0233, + "num_tokens": 17358384.0, + "reward": 2.9620676040649414, + "reward_std": 0.07208557426929474, + "rewards/reward_fn/mean": 2.9620676040649414, + "rewards/reward_fn/std": 0.07208552956581116, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 884.0, + "completions/max_terminated_length": 884.0, + "completions/mean_length": 231.09375, + "completions/mean_terminated_length": 231.09375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.039885435451363103, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9765625, + "kl": 0.030499441782012582, + "learning_rate": 7.85e-06, + "loss": -0.0047, + "num_tokens": 17396947.0, + "reward": 3.423821449279785, + "reward_std": 0.5857948660850525, + "rewards/reward_fn/mean": 3.423821449279785, + "rewards/reward_fn/std": 0.5857948660850525, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1796.0, + "completions/mean_length": 451.96875, + "completions/mean_terminated_length": 400.4838562011719, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.03999151373713801, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "kl": 0.022297584218904376, + "learning_rate": 7.849599999999999e-06, + "loss": 0.2615, + "num_tokens": 17442866.0, + "reward": 2.704645872116089, + "reward_std": 0.6090136170387268, + "rewards/reward_fn/mean": 2.704645872116089, + "rewards/reward_fn/std": 0.6090136170387268, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.0, + "completions/max_terminated_length": 577.0, + "completions/mean_length": 374.875, + "completions/mean_terminated_length": 374.875, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.04009759202291291, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.021644736174494028, + "learning_rate": 7.8492e-06, + "loss": -0.0128, + "num_tokens": 17487022.0, + "reward": 2.907583713531494, + "reward_std": 0.46556609869003296, + "rewards/reward_fn/mean": 2.907583713531494, + "rewards/reward_fn/std": 0.46556606888771057, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1103.0, + "completions/max_terminated_length": 1103.0, + "completions/mean_length": 316.59375, + "completions/mean_terminated_length": 316.59375, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.04020367030868781, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9765625, + "kl": 0.02532161376439035, + "learning_rate": 7.8488e-06, + "loss": -0.0199, + "num_tokens": 17530977.0, + "reward": 3.670456886291504, + "reward_std": 0.41091057658195496, + "rewards/reward_fn/mean": 3.670456886291504, + "rewards/reward_fn/std": 0.41091054677963257, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1597.0, + "completions/max_terminated_length": 1597.0, + "completions/mean_length": 468.9375, + "completions/mean_terminated_length": 468.9375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.040309748594462715, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.018143147695809603, + "learning_rate": 7.8484e-06, + "loss": 0.1946, + "num_tokens": 17587871.0, + "reward": 2.6948769092559814, + "reward_std": 0.26830726861953735, + "rewards/reward_fn/mean": 2.6948769092559814, + "rewards/reward_fn/std": 0.26830726861953735, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 669.0, + "completions/mean_length": 357.5, + "completions/mean_terminated_length": 302.9677429199219, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.040415826880237614, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.023040967527776957, + "learning_rate": 7.848e-06, + "loss": 0.2107, + "num_tokens": 17636175.0, + "reward": 3.7304601669311523, + "reward_std": 0.7834086418151855, + "rewards/reward_fn/mean": 3.7304601669311523, + "rewards/reward_fn/std": 0.7834085822105408, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 195.875, + "completions/mean_terminated_length": 195.875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.04052190516601252, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10693359375, + "kl": 0.02284376136958599, + "learning_rate": 7.8476e-06, + "loss": 0.0009, + "num_tokens": 17657579.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 118.0, + "completions/max_terminated_length": 118.0, + "completions/mean_length": 89.5, + "completions/mean_terminated_length": 89.5, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.04062798345178742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2890625, + "kl": 0.03317818860523403, + "learning_rate": 7.8472e-06, + "loss": 0.0013, + "num_tokens": 17685723.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1824.0, + "completions/mean_length": 1247.1875, + "completions/mean_terminated_length": 1221.3548583984375, + "completions/min_length": 678.0, + "completions/min_terminated_length": 678.0, + "epoch": 0.04073406173756232, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.80078125, + "kl": 0.013792349374853075, + "learning_rate": 7.846799999999999e-06, + "loss": -0.0208, + "num_tokens": 17768065.0, + "reward": 2.3582417964935303, + "reward_std": 0.49515044689178467, + "rewards/reward_fn/mean": 2.3582417964935303, + "rewards/reward_fn/std": 0.4951504170894623, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 563.0, + "completions/max_terminated_length": 563.0, + "completions/mean_length": 341.53125, + "completions/mean_terminated_length": 341.53125, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.040840140023337225, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.019186724559403956, + "learning_rate": 7.8464e-06, + "loss": 0.0339, + "num_tokens": 17814194.0, + "reward": 2.7873990535736084, + "reward_std": 0.02758314460515976, + "rewards/reward_fn/mean": 2.7873990535736084, + "rewards/reward_fn/std": 0.027583174407482147, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 778.0, + "completions/max_terminated_length": 778.0, + "completions/mean_length": 384.78125, + "completions/mean_terminated_length": 384.78125, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.040946218309112124, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.024335808353498578, + "learning_rate": 7.845999999999999e-06, + "loss": -0.0277, + "num_tokens": 17859755.0, + "reward": 2.8218817710876465, + "reward_std": 0.051523782312870026, + "rewards/reward_fn/mean": 2.8218817710876465, + "rewards/reward_fn/std": 0.05152379348874092, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1863.0, + "completions/mean_length": 541.5625, + "completions/mean_terminated_length": 492.96771240234375, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.041052296594887024, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.02720357710495591, + "learning_rate": 7.8456e-06, + "loss": 0.041, + "num_tokens": 17909629.0, + "reward": 2.3109424114227295, + "reward_std": 0.6412019729614258, + "rewards/reward_fn/mean": 2.3109424114227295, + "rewards/reward_fn/std": 0.641201913356781, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 636.0, + "completions/max_terminated_length": 636.0, + "completions/mean_length": 220.625, + "completions/mean_terminated_length": 220.625, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.04115837488066193, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.828125, + "kl": 0.022277627140283585, + "learning_rate": 7.845199999999999e-06, + "loss": 0.2392, + "num_tokens": 17937329.0, + "reward": 3.7471237182617188, + "reward_std": 0.4859867990016937, + "rewards/reward_fn/mean": 3.7471237182617188, + "rewards/reward_fn/std": 0.4859868288040161, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.0, + "completions/max_terminated_length": 743.0, + "completions/mean_length": 181.625, + "completions/mean_terminated_length": 181.625, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.04126445316643683, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1337890625, + "kl": 0.033335258485749364, + "learning_rate": 7.8448e-06, + "loss": 0.0013, + "num_tokens": 17977157.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 997.0, + "completions/max_terminated_length": 997.0, + "completions/mean_length": 270.53125, + "completions/mean_terminated_length": 270.53125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.041370531452211735, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09716796875, + "kl": 0.024321889970451593, + "learning_rate": 7.8444e-06, + "loss": 0.001, + "num_tokens": 18014102.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 241.03125, + "completions/mean_terminated_length": 241.03125, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.041476609737986635, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "kl": 0.02815451007336378, + "learning_rate": 7.844e-06, + "loss": 0.0235, + "num_tokens": 18058551.0, + "reward": 3.104393482208252, + "reward_std": 0.4822303354740143, + "rewards/reward_fn/mean": 3.104393482208252, + "rewards/reward_fn/std": 0.48223036527633667, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.0, + "completions/max_terminated_length": 521.0, + "completions/mean_length": 297.96875, + "completions/mean_terminated_length": 297.96875, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.041582688023761534, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.020035157212987542, + "learning_rate": 7.8436e-06, + "loss": -0.0126, + "num_tokens": 18098454.0, + "reward": 3.4708333015441895, + "reward_std": 0.5728017091751099, + "rewards/reward_fn/mean": 3.4708333015441895, + "rewards/reward_fn/std": 0.5728016495704651, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 893.0, + "completions/max_terminated_length": 893.0, + "completions/mean_length": 300.21875, + "completions/mean_terminated_length": 300.21875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.04168876630953644, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.02791671548038721, + "learning_rate": 7.8432e-06, + "loss": 0.2627, + "num_tokens": 18152669.0, + "reward": 3.1782641410827637, + "reward_std": 1.0244643688201904, + "rewards/reward_fn/mean": 3.1782641410827637, + "rewards/reward_fn/std": 1.0244643688201904, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 882.0, + "completions/max_terminated_length": 882.0, + "completions/mean_length": 455.0, + "completions/mean_terminated_length": 455.0, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.04179484459531134, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.02029509423300624, + "learning_rate": 7.8428e-06, + "loss": -0.0168, + "num_tokens": 18200637.0, + "reward": 2.588132381439209, + "reward_std": 0.3573741018772125, + "rewards/reward_fn/mean": 2.588132381439209, + "rewards/reward_fn/std": 0.3573741316795349, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 179.90625, + "completions/mean_terminated_length": 179.90625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.04190092288108624, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.05105016054585576, + "learning_rate": 7.8424e-06, + "loss": 0.0476, + "num_tokens": 18236570.0, + "reward": 2.7826709747314453, + "reward_std": 0.04271591454744339, + "rewards/reward_fn/mean": 2.7826709747314453, + "rewards/reward_fn/std": 0.04271586239337921, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 266.65625, + "completions/mean_terminated_length": 266.65625, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.042007001166861145, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.023414030205458403, + "learning_rate": 7.841999999999999e-06, + "loss": 0.0183, + "num_tokens": 18276303.0, + "reward": 2.849137544631958, + "reward_std": 0.30862998962402344, + "rewards/reward_fn/mean": 2.849137544631958, + "rewards/reward_fn/std": 0.30862998962402344, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 905.0, + "completions/max_terminated_length": 905.0, + "completions/mean_length": 131.1875, + "completions/mean_terminated_length": 131.1875, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.042113079452636044, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.546875, + "kl": 0.029712033225223422, + "learning_rate": 7.8416e-06, + "loss": -0.0091, + "num_tokens": 18314901.0, + "reward": 3.9370594024658203, + "reward_std": 0.2479006052017212, + "rewards/reward_fn/mean": 3.9370594024658203, + "rewards/reward_fn/std": 0.24790059030056, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1483.0, + "completions/mean_length": 583.15625, + "completions/mean_terminated_length": 535.9031982421875, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.04221915773841095, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.024179237661883235, + "learning_rate": 7.841199999999999e-06, + "loss": 0.2692, + "num_tokens": 18366682.0, + "reward": 2.734818935394287, + "reward_std": 0.5017510056495667, + "rewards/reward_fn/mean": 2.734818935394287, + "rewards/reward_fn/std": 0.5017510056495667, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1412.0, + "completions/max_terminated_length": 1412.0, + "completions/mean_length": 448.9375, + "completions/mean_terminated_length": 448.9375, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.04232523602418585, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.021006060764193535, + "learning_rate": 7.8408e-06, + "loss": -0.0214, + "num_tokens": 18432152.0, + "reward": 3.894382953643799, + "reward_std": 0.33419135212898254, + "rewards/reward_fn/mean": 3.894382953643799, + "rewards/reward_fn/std": 0.33419132232666016, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1599.0, + "completions/max_terminated_length": 1599.0, + "completions/mean_length": 788.9375, + "completions/mean_terminated_length": 788.9375, + "completions/min_length": 386.0, + "completions/min_terminated_length": 386.0, + "epoch": 0.04243131430996075, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.01675853121560067, + "learning_rate": 7.840399999999999e-06, + "loss": -0.0367, + "num_tokens": 18497974.0, + "reward": 2.990764856338501, + "reward_std": 0.4567594826221466, + "rewards/reward_fn/mean": 2.990764856338501, + "rewards/reward_fn/std": 0.4567594528198242, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 191.0625, + "completions/mean_terminated_length": 191.0625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.042537392595735656, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.119140625, + "kl": 0.03011680906638503, + "learning_rate": 7.84e-06, + "loss": 0.0012, + "num_tokens": 18556440.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1049.0, + "completions/max_terminated_length": 1049.0, + "completions/mean_length": 357.9375, + "completions/mean_terminated_length": 357.9375, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.042643470881510555, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.021929903188720345, + "learning_rate": 7.8396e-06, + "loss": 0.0267, + "num_tokens": 18602006.0, + "reward": 3.1588587760925293, + "reward_std": 0.5363696217536926, + "rewards/reward_fn/mean": 3.1588587760925293, + "rewards/reward_fn/std": 0.5363695621490479, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1226.0, + "completions/max_terminated_length": 1226.0, + "completions/mean_length": 370.78125, + "completions/mean_terminated_length": 370.78125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.042749549167285454, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.023676603566855192, + "learning_rate": 7.8392e-06, + "loss": -0.0388, + "num_tokens": 18650863.0, + "reward": 2.7991724014282227, + "reward_std": 0.27458456158638, + "rewards/reward_fn/mean": 2.7991724014282227, + "rewards/reward_fn/std": 0.2745845317840576, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1534.0, + "completions/max_terminated_length": 1534.0, + "completions/mean_length": 664.1875, + "completions/mean_terminated_length": 664.1875, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.04285562745306036, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.017545952810905874, + "learning_rate": 7.8388e-06, + "loss": 0.0957, + "num_tokens": 18707061.0, + "reward": 2.512760877609253, + "reward_std": 0.5735504031181335, + "rewards/reward_fn/mean": 2.512760877609253, + "rewards/reward_fn/std": 0.5735504031181335, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 572.0, + "completions/max_terminated_length": 572.0, + "completions/mean_length": 189.5625, + "completions/mean_terminated_length": 189.5625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.04296170573883526, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.130859375, + "kl": 0.030075914692133665, + "learning_rate": 7.8384e-06, + "loss": 0.0012, + "num_tokens": 18759783.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 924.0, + "completions/max_terminated_length": 924.0, + "completions/mean_length": 341.96875, + "completions/mean_terminated_length": 341.96875, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.04306778402461016, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.02385372808203101, + "learning_rate": 7.838e-06, + "loss": 0.0794, + "num_tokens": 18786086.0, + "reward": 3.8906993865966797, + "reward_std": 0.34537699818611145, + "rewards/reward_fn/mean": 3.8906993865966797, + "rewards/reward_fn/std": 0.34537696838378906, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 275.5, + "completions/mean_terminated_length": 275.5, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.043173862310385065, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.03006741451099515, + "learning_rate": 7.8376e-06, + "loss": -0.0234, + "num_tokens": 18831286.0, + "reward": 3.787572145462036, + "reward_std": 0.4494977593421936, + "rewards/reward_fn/mean": 3.787572145462036, + "rewards/reward_fn/std": 0.4494977295398712, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1561.0, + "completions/mean_length": 529.125, + "completions/mean_terminated_length": 480.1290283203125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.043279940596159965, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.0166288634063676, + "learning_rate": 7.8372e-06, + "loss": 0.2106, + "num_tokens": 18929114.0, + "reward": 3.8349366188049316, + "reward_std": 0.735542893409729, + "rewards/reward_fn/mean": 3.8349366188049316, + "rewards/reward_fn/std": 0.7355428338050842, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 156.15625, + "completions/mean_terminated_length": 156.15625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.04338601888193487, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10986328125, + "kl": 0.024419703288003802, + "learning_rate": 7.8368e-06, + "loss": 0.001, + "num_tokens": 18952959.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 218.375, + "completions/mean_terminated_length": 218.375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.04349209716770977, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.033032523933798075, + "learning_rate": 7.8364e-06, + "loss": -0.0199, + "num_tokens": 18995531.0, + "reward": 3.660304069519043, + "reward_std": 0.5844630002975464, + "rewards/reward_fn/mean": 3.660304069519043, + "rewards/reward_fn/std": 0.5844630002975464, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 290.40625, + "completions/mean_terminated_length": 290.40625, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.04359817545348467, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.028384562116116285, + "learning_rate": 7.836e-06, + "loss": 0.0034, + "num_tokens": 19039832.0, + "reward": 3.966031551361084, + "reward_std": 0.19215430319309235, + "rewards/reward_fn/mean": 3.966031551361084, + "rewards/reward_fn/std": 0.19215430319309235, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 109.0, + "completions/max_terminated_length": 109.0, + "completions/mean_length": 73.25, + "completions/mean_terminated_length": 73.25, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.043704253739259576, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.201171875, + "kl": 0.024869739543646574, + "learning_rate": 7.8356e-06, + "loss": 0.001, + "num_tokens": 19073664.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1838.0, + "completions/mean_length": 566.1875, + "completions/mean_terminated_length": 518.3870849609375, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.043810332025034475, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.029686052352190018, + "learning_rate": 7.8352e-06, + "loss": 0.2334, + "num_tokens": 19127142.0, + "reward": 2.6168479919433594, + "reward_std": 0.543907880783081, + "rewards/reward_fn/mean": 2.6168479919433594, + "rewards/reward_fn/std": 0.5439079403877258, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 143.03125, + "completions/mean_terminated_length": 143.03125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.043916410310809374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1494140625, + "kl": 0.03072983492165804, + "learning_rate": 7.8348e-06, + "loss": 0.0012, + "num_tokens": 19166471.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 170.53125, + "completions/mean_terminated_length": 170.53125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.04402248859658428, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.130859375, + "kl": 0.025355301331728697, + "learning_rate": 7.834399999999999e-06, + "loss": 0.001, + "num_tokens": 19204504.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 242.65625, + "completions/mean_terminated_length": 242.65625, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.04412856688235918, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9375, + "kl": 0.024939125403761864, + "learning_rate": 7.834e-06, + "loss": 0.0628, + "num_tokens": 19252557.0, + "reward": 2.96942138671875, + "reward_std": 0.08691102266311646, + "rewards/reward_fn/mean": 2.96942138671875, + "rewards/reward_fn/std": 0.08691102266311646, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 344.25, + "completions/mean_terminated_length": 344.25, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.044234645168134086, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.02124662697315216, + "learning_rate": 7.833599999999999e-06, + "loss": -0.0249, + "num_tokens": 19302485.0, + "reward": 1.736589789390564, + "reward_std": 0.022552739828824997, + "rewards/reward_fn/mean": 1.736589789390564, + "rewards/reward_fn/std": 0.022552751004695892, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 296.3125, + "completions/mean_terminated_length": 296.3125, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.044340723453908985, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.025449416134506464, + "learning_rate": 7.8332e-06, + "loss": 0.0491, + "num_tokens": 19347167.0, + "reward": 3.7305917739868164, + "reward_std": 0.5530329942703247, + "rewards/reward_fn/mean": 3.7305917739868164, + "rewards/reward_fn/std": 0.5530329942703247, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 821.0, + "completions/max_terminated_length": 821.0, + "completions/mean_length": 293.28125, + "completions/mean_terminated_length": 293.28125, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.044446801739683885, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06591796875, + "kl": 0.018630424048751593, + "learning_rate": 7.832799999999999e-06, + "loss": 0.0007, + "num_tokens": 19389608.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 850.0, + "completions/max_terminated_length": 850.0, + "completions/mean_length": 239.65625, + "completions/mean_terminated_length": 239.65625, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.04455288002545879, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09130859375, + "kl": 0.02400025725364685, + "learning_rate": 7.8324e-06, + "loss": 0.001, + "num_tokens": 19432349.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 171.9375, + "completions/mean_terminated_length": 171.9375, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.04465895831123369, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1923828125, + "kl": 0.04056114191189408, + "learning_rate": 7.831999999999999e-06, + "loss": 0.0016, + "num_tokens": 19462107.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.0, + "completions/max_terminated_length": 757.0, + "completions/mean_length": 238.71875, + "completions/mean_terminated_length": 238.71875, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.04476503659700859, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.02605710015632212, + "learning_rate": 7.8316e-06, + "loss": 0.1752, + "num_tokens": 19518066.0, + "reward": 3.8169188499450684, + "reward_std": 0.602931559085846, + "rewards/reward_fn/mean": 3.8169188499450684, + "rewards/reward_fn/std": 0.602931559085846, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 184.21875, + "completions/mean_terminated_length": 184.21875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.044871114882783496, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.099609375, + "kl": 0.02194814942777157, + "learning_rate": 7.831199999999999e-06, + "loss": 0.0009, + "num_tokens": 19562905.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 260.3125, + "completions/mean_terminated_length": 260.3125, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.044977193168558395, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.02712744101881981, + "learning_rate": 7.8308e-06, + "loss": -0.0232, + "num_tokens": 19615107.0, + "reward": 3.930159091949463, + "reward_std": 0.39508044719696045, + "rewards/reward_fn/mean": 3.930159091949463, + "rewards/reward_fn/std": 0.39508041739463806, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1212.0, + "completions/mean_length": 549.90625, + "completions/mean_terminated_length": 501.58062744140625, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.0450832714543333, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.021885179448872805, + "learning_rate": 7.8304e-06, + "loss": 0.1124, + "num_tokens": 19667232.0, + "reward": 3.4282619953155518, + "reward_std": 0.8545926809310913, + "rewards/reward_fn/mean": 3.4282619953155518, + "rewards/reward_fn/std": 0.8545926213264465, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 119.40625, + "completions/mean_terminated_length": 119.40625, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.0451893497401082, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0986328125, + "kl": 0.01785171974916011, + "learning_rate": 7.83e-06, + "loss": 0.0007, + "num_tokens": 19715853.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1158.0, + "completions/max_terminated_length": 1158.0, + "completions/mean_length": 403.90625, + "completions/mean_terminated_length": 403.90625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.0452954280258831, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.022983923787251115, + "learning_rate": 7.8296e-06, + "loss": -0.0171, + "num_tokens": 19759690.0, + "reward": 2.5151357650756836, + "reward_std": 0.5577123761177063, + "rewards/reward_fn/mean": 2.5151357650756836, + "rewards/reward_fn/std": 0.5577123165130615, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 175.34375, + "completions/mean_terminated_length": 175.34375, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.045401506311658006, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.029623025562614202, + "learning_rate": 7.8292e-06, + "loss": -0.1286, + "num_tokens": 19800661.0, + "reward": 3.7741332054138184, + "reward_std": 0.7420908808708191, + "rewards/reward_fn/mean": 3.7741332054138184, + "rewards/reward_fn/std": 0.7420908808708191, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 977.0, + "completions/max_terminated_length": 977.0, + "completions/mean_length": 285.15625, + "completions/mean_terminated_length": 285.15625, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.045507584597432905, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07958984375, + "kl": 0.02174567524343729, + "learning_rate": 7.8288e-06, + "loss": 0.0009, + "num_tokens": 19856442.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1446.0, + "completions/max_terminated_length": 1446.0, + "completions/mean_length": 676.75, + "completions/mean_terminated_length": 676.75, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.045613662883207805, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.019659652840346098, + "learning_rate": 7.8284e-06, + "loss": 0.0379, + "num_tokens": 19922322.0, + "reward": 2.3661680221557617, + "reward_std": 0.5572351217269897, + "rewards/reward_fn/mean": 2.3661680221557617, + "rewards/reward_fn/std": 0.5572351813316345, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.0, + "completions/max_terminated_length": 690.0, + "completions/mean_length": 362.625, + "completions/mean_terminated_length": 362.625, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.04571974116898271, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.025779376970604062, + "learning_rate": 7.828e-06, + "loss": -0.0389, + "num_tokens": 19969542.0, + "reward": 2.871279239654541, + "reward_std": 1.0416733026504517, + "rewards/reward_fn/mean": 2.871279239654541, + "rewards/reward_fn/std": 1.0416733026504517, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 887.0, + "completions/max_terminated_length": 887.0, + "completions/mean_length": 303.84375, + "completions/mean_terminated_length": 303.84375, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.04582581945475761, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.019519688561558723, + "learning_rate": 7.8276e-06, + "loss": 0.0645, + "num_tokens": 19999745.0, + "reward": 3.9620537757873535, + "reward_std": 0.21465659141540527, + "rewards/reward_fn/mean": 3.9620537757873535, + "rewards/reward_fn/std": 0.21465659141540527, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1192.0, + "completions/max_terminated_length": 1192.0, + "completions/mean_length": 399.125, + "completions/mean_terminated_length": 399.125, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.04593189774053251, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.018953290185891092, + "learning_rate": 7.8272e-06, + "loss": 0.0483, + "num_tokens": 20061317.0, + "reward": 2.9752299785614014, + "reward_std": 0.07937107980251312, + "rewards/reward_fn/mean": 2.9752299785614014, + "rewards/reward_fn/std": 0.07937107235193253, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 119.875, + "completions/mean_terminated_length": 119.875, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.046037976026307416, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1259765625, + "kl": 0.027189034270122647, + "learning_rate": 7.8268e-06, + "loss": 0.0011, + "num_tokens": 20088545.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 497.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 295.03125, + "completions/mean_terminated_length": 295.03125, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.046144054312082315, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.026012770365923643, + "learning_rate": 7.826399999999998e-06, + "loss": 0.0249, + "num_tokens": 20128834.0, + "reward": 2.9719934463500977, + "reward_std": 0.24174726009368896, + "rewards/reward_fn/mean": 2.9719934463500977, + "rewards/reward_fn/std": 0.24174723029136658, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 799.0, + "completions/max_terminated_length": 799.0, + "completions/mean_length": 432.125, + "completions/mean_terminated_length": 432.125, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.04625013259785722, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.01361775363329798, + "learning_rate": 7.826e-06, + "loss": -0.0077, + "num_tokens": 20186982.0, + "reward": 2.8467493057250977, + "reward_std": 0.28382164239883423, + "rewards/reward_fn/mean": 2.8467493057250977, + "rewards/reward_fn/std": 0.28382164239883423, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 257.0625, + "completions/mean_terminated_length": 257.0625, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.04635621088363212, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.023967791348695755, + "learning_rate": 7.8256e-06, + "loss": 0.0345, + "num_tokens": 20246248.0, + "reward": 3.7507588863372803, + "reward_std": 0.47892749309539795, + "rewards/reward_fn/mean": 3.7507588863372803, + "rewards/reward_fn/std": 0.47892752289772034, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 285.9375, + "completions/mean_terminated_length": 285.9375, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.04646228916940702, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.02458371128886938, + "learning_rate": 7.8252e-06, + "loss": -0.0299, + "num_tokens": 20291078.0, + "reward": 3.627525806427002, + "reward_std": 0.4593088626861572, + "rewards/reward_fn/mean": 3.627525806427002, + "rewards/reward_fn/std": 0.45930883288383484, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 656.0, + "completions/max_terminated_length": 656.0, + "completions/mean_length": 258.09375, + "completions/mean_terminated_length": 258.09375, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.046568367455181926, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.01905307243578136, + "learning_rate": 7.8248e-06, + "loss": -0.039, + "num_tokens": 20332777.0, + "reward": 3.9657504558563232, + "reward_std": 0.19374500215053558, + "rewards/reward_fn/mean": 3.9657504558563232, + "rewards/reward_fn/std": 0.19374501705169678, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 720.0, + "completions/max_terminated_length": 720.0, + "completions/mean_length": 180.625, + "completions/mean_terminated_length": 180.625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.046674445740956826, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.021813688217662275, + "learning_rate": 7.824399999999999e-06, + "loss": -0.0704, + "num_tokens": 20378941.0, + "reward": 3.5305161476135254, + "reward_std": 0.44998371601104736, + "rewards/reward_fn/mean": 3.5305161476135254, + "rewards/reward_fn/std": 0.449983686208725, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 721.0, + "completions/max_terminated_length": 721.0, + "completions/mean_length": 221.03125, + "completions/mean_terminated_length": 221.03125, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.046780524026731725, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.828125, + "kl": 0.027017208514735103, + "learning_rate": 7.824e-06, + "loss": 0.0379, + "num_tokens": 20422366.0, + "reward": 3.9637999534606934, + "reward_std": 0.2047785222530365, + "rewards/reward_fn/mean": 3.9637999534606934, + "rewards/reward_fn/std": 0.2047785222530365, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 709.0, + "completions/max_terminated_length": 709.0, + "completions/mean_length": 175.125, + "completions/mean_terminated_length": 175.125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.04688660231250663, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1318359375, + "kl": 0.025995554169639945, + "learning_rate": 7.823599999999999e-06, + "loss": 0.001, + "num_tokens": 20464226.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 694.0, + "completions/max_terminated_length": 694.0, + "completions/mean_length": 353.53125, + "completions/mean_terminated_length": 353.53125, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.04699268059828153, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.023292170371860266, + "learning_rate": 7.8232e-06, + "loss": 0.0398, + "num_tokens": 20510323.0, + "reward": 3.5015358924865723, + "reward_std": 0.5767775177955627, + "rewards/reward_fn/mean": 3.5015358924865723, + "rewards/reward_fn/std": 0.5767775177955627, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 786.0, + "completions/max_terminated_length": 786.0, + "completions/mean_length": 313.78125, + "completions/mean_terminated_length": 313.78125, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.04709875888405644, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0654296875, + "kl": 0.019186518737114966, + "learning_rate": 7.822799999999999e-06, + "loss": 0.0008, + "num_tokens": 20552812.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 226.75, + "completions/mean_terminated_length": 226.75, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.047204837169831336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13671875, + "kl": 0.0306410426273942, + "learning_rate": 7.8224e-06, + "loss": 0.0012, + "num_tokens": 20592644.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 705.0, + "completions/max_terminated_length": 705.0, + "completions/mean_length": 241.59375, + "completions/mean_terminated_length": 241.59375, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.047310915455606235, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.02532052854076028, + "learning_rate": 7.821999999999999e-06, + "loss": 0.1193, + "num_tokens": 20617431.0, + "reward": 2.9998433589935303, + "reward_std": 0.0696294903755188, + "rewards/reward_fn/mean": 2.9998433589935303, + "rewards/reward_fn/std": 0.0696294978260994, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 715.0, + "completions/mean_length": 453.90625, + "completions/mean_terminated_length": 402.4838562011719, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.04741699374138114, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.016588653321377933, + "learning_rate": 7.8216e-06, + "loss": 0.1984, + "num_tokens": 20653140.0, + "reward": 2.85459041595459, + "reward_std": 0.6958485841751099, + "rewards/reward_fn/mean": 2.85459041595459, + "rewards/reward_fn/std": 0.6958485841751099, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 184.4375, + "completions/mean_terminated_length": 184.4375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.04752307202715604, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09130859375, + "kl": 0.019108422100543976, + "learning_rate": 7.8212e-06, + "loss": 0.0008, + "num_tokens": 20698146.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 993.0, + "completions/max_terminated_length": 993.0, + "completions/mean_length": 247.53125, + "completions/mean_terminated_length": 247.53125, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.04762915031293094, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.02202951116487384, + "learning_rate": 7.8208e-06, + "loss": 0.0009, + "num_tokens": 20758035.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 570.0, + "completions/max_terminated_length": 570.0, + "completions/mean_length": 123.84375, + "completions/mean_terminated_length": 123.84375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.047735228598705846, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16796875, + "kl": 0.03342678747139871, + "learning_rate": 7.8204e-06, + "loss": 0.0013, + "num_tokens": 20794574.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 123.25, + "completions/mean_terminated_length": 123.25, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.047841306884480746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.111328125, + "kl": 0.02307695336639881, + "learning_rate": 7.82e-06, + "loss": 0.0009, + "num_tokens": 20839446.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.0, + "completions/max_terminated_length": 628.0, + "completions/mean_length": 356.28125, + "completions/mean_terminated_length": 356.28125, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.04794738517025565, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.021392826223745942, + "learning_rate": 7.8196e-06, + "loss": 0.0044, + "num_tokens": 20888191.0, + "reward": 3.5172977447509766, + "reward_std": 0.7753545045852661, + "rewards/reward_fn/mean": 3.5172977447509766, + "rewards/reward_fn/std": 0.7753545045852661, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 220.09375, + "completions/mean_terminated_length": 220.09375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.04805346345603055, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.029516591923311353, + "learning_rate": 7.8192e-06, + "loss": 0.0179, + "num_tokens": 20912642.0, + "reward": 3.9728949069976807, + "reward_std": 0.15332958102226257, + "rewards/reward_fn/mean": 3.9728949069976807, + "rewards/reward_fn/std": 0.15332959592342377, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 824.0, + "completions/max_terminated_length": 824.0, + "completions/mean_length": 210.84375, + "completions/mean_terminated_length": 210.84375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.04815954174180545, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.71875, + "kl": 0.031340898014605045, + "learning_rate": 7.8188e-06, + "loss": 0.2922, + "num_tokens": 20958333.0, + "reward": 3.886120557785034, + "reward_std": 0.4653671979904175, + "rewards/reward_fn/mean": 3.886120557785034, + "rewards/reward_fn/std": 0.4653671979904175, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 188.5625, + "completions/mean_terminated_length": 188.5625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.04826562002758036, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.28125, + "kl": 0.02241483051329851, + "learning_rate": 7.8184e-06, + "loss": -0.0742, + "num_tokens": 20995407.0, + "reward": 2.9604923725128174, + "reward_std": 0.46088868379592896, + "rewards/reward_fn/mean": 2.9604923725128174, + "rewards/reward_fn/std": 0.46088865399360657, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 172.625, + "completions/mean_terminated_length": 172.625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.048371698313355256, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4375, + "kl": 0.021063596475869417, + "learning_rate": 7.817999999999999e-06, + "loss": 0.0673, + "num_tokens": 21048995.0, + "reward": 3.8552751541137695, + "reward_std": 0.3901585340499878, + "rewards/reward_fn/mean": 3.8552751541137695, + "rewards/reward_fn/std": 0.3901585042476654, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 152.8125, + "completions/mean_terminated_length": 152.8125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.048477776599130155, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.453125, + "kl": 0.02626768359914422, + "learning_rate": 7.8176e-06, + "loss": 0.0393, + "num_tokens": 21088477.0, + "reward": 2.950690746307373, + "reward_std": 0.06558680534362793, + "rewards/reward_fn/mean": 2.950690746307373, + "rewards/reward_fn/std": 0.06558679044246674, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1326.0, + "completions/mean_length": 729.9375, + "completions/mean_terminated_length": 642.0667114257812, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.04858385488490506, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.022620241856202483, + "learning_rate": 7.817199999999999e-06, + "loss": 0.3226, + "num_tokens": 21145979.0, + "reward": 2.522402286529541, + "reward_std": 0.7907276749610901, + "rewards/reward_fn/mean": 2.522402286529541, + "rewards/reward_fn/std": 0.7907276749610901, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 777.0, + "completions/max_terminated_length": 777.0, + "completions/mean_length": 242.125, + "completions/mean_terminated_length": 242.125, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.04868993317067996, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.029362128349021077, + "learning_rate": 7.8168e-06, + "loss": -0.0061, + "num_tokens": 21197503.0, + "reward": 3.963871479034424, + "reward_std": 0.20437325537204742, + "rewards/reward_fn/mean": 3.963871479034424, + "rewards/reward_fn/std": 0.20437327027320862, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1025.0, + "completions/max_terminated_length": 1025.0, + "completions/mean_length": 252.65625, + "completions/mean_terminated_length": 252.65625, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.04879601145645486, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0947265625, + "kl": 0.025767018785700202, + "learning_rate": 7.8164e-06, + "loss": 0.001, + "num_tokens": 21244596.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1352.0, + "completions/max_terminated_length": 1352.0, + "completions/mean_length": 299.65625, + "completions/mean_terminated_length": 299.65625, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.048902089742229767, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08544921875, + "kl": 0.024106395663693547, + "learning_rate": 7.816e-06, + "loss": 0.001, + "num_tokens": 21296137.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 813.0, + "completions/max_terminated_length": 813.0, + "completions/mean_length": 304.3125, + "completions/mean_terminated_length": 304.3125, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.049008168028004666, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.026658632792532444, + "learning_rate": 7.8156e-06, + "loss": -0.0012, + "num_tokens": 21329907.0, + "reward": 2.8445465564727783, + "reward_std": 0.2993001639842987, + "rewards/reward_fn/mean": 2.8445465564727783, + "rewards/reward_fn/std": 0.2993001937866211, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 864.0, + "completions/mean_length": 521.0, + "completions/mean_terminated_length": 471.7419128417969, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.04911424631377957, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.02180269779637456, + "learning_rate": 7.8152e-06, + "loss": 0.2696, + "num_tokens": 21368947.0, + "reward": 2.6146130561828613, + "reward_std": 0.5501555800437927, + "rewards/reward_fn/mean": 2.6146130561828613, + "rewards/reward_fn/std": 0.550155520439148, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1207.0, + "completions/max_terminated_length": 1207.0, + "completions/mean_length": 392.78125, + "completions/mean_terminated_length": 392.78125, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.04922032459955447, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.024015987291932106, + "learning_rate": 7.8148e-06, + "loss": 0.086, + "num_tokens": 21417260.0, + "reward": 2.8569464683532715, + "reward_std": 0.05871182680130005, + "rewards/reward_fn/mean": 2.8569464683532715, + "rewards/reward_fn/std": 0.058711789548397064, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1265.0, + "completions/mean_length": 618.625, + "completions/mean_terminated_length": 523.3333740234375, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "epoch": 0.04932640288532937, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.01614295574836433, + "learning_rate": 7.8144e-06, + "loss": 0.3621, + "num_tokens": 21479552.0, + "reward": 3.6563806533813477, + "reward_std": 0.9594557881355286, + "rewards/reward_fn/mean": 3.6563806533813477, + "rewards/reward_fn/std": 0.9594557881355286, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 319.65625, + "completions/mean_terminated_length": 319.65625, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.04943248117110428, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.01879534707404673, + "learning_rate": 7.814e-06, + "loss": 0.0444, + "num_tokens": 21522613.0, + "reward": 1.6788173913955688, + "reward_std": 0.026051480323076248, + "rewards/reward_fn/mean": 1.6788173913955688, + "rewards/reward_fn/std": 0.026051471009850502, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 98.46875, + "completions/mean_terminated_length": 98.46875, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.049538559456879176, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.375, + "kl": 0.028762757312506437, + "learning_rate": 7.8136e-06, + "loss": -0.0682, + "num_tokens": 21563364.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1853.0, + "completions/max_terminated_length": 1853.0, + "completions/mean_length": 643.59375, + "completions/mean_terminated_length": 643.59375, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.049644637742654076, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.030355968279764056, + "learning_rate": 7.8132e-06, + "loss": -0.0552, + "num_tokens": 21614039.0, + "reward": 2.8546223640441895, + "reward_std": 0.9961849451065063, + "rewards/reward_fn/mean": 2.8546223640441895, + "rewards/reward_fn/std": 0.9961848855018616, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 219.9375, + "completions/mean_terminated_length": 219.9375, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.04975071602842898, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.024273998336866498, + "learning_rate": 7.812799999999999e-06, + "loss": 0.0575, + "num_tokens": 21655349.0, + "reward": 3.557422161102295, + "reward_std": 0.5809412598609924, + "rewards/reward_fn/mean": 3.557422161102295, + "rewards/reward_fn/std": 0.5809412002563477, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.0, + "completions/max_terminated_length": 579.0, + "completions/mean_length": 195.125, + "completions/mean_terminated_length": 195.125, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.04985679431420388, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.02798668248578906, + "learning_rate": 7.8124e-06, + "loss": -0.168, + "num_tokens": 21694585.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 996.0, + "completions/mean_length": 497.59375, + "completions/mean_terminated_length": 447.58062744140625, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.04996287259997879, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.017523658694699407, + "learning_rate": 7.812e-06, + "loss": 0.2898, + "num_tokens": 21750412.0, + "reward": 2.634608268737793, + "reward_std": 0.5835399031639099, + "rewards/reward_fn/mean": 2.634608268737793, + "rewards/reward_fn/std": 0.5835399031639099, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 725.0, + "completions/max_terminated_length": 725.0, + "completions/mean_length": 279.34375, + "completions/mean_terminated_length": 279.34375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.05006895088575369, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.027581357397139072, + "learning_rate": 7.8116e-06, + "loss": 0.0572, + "num_tokens": 21807479.0, + "reward": 3.4291439056396484, + "reward_std": 0.8019877076148987, + "rewards/reward_fn/mean": 3.4291439056396484, + "rewards/reward_fn/std": 0.8019877076148987, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 190.90625, + "completions/mean_terminated_length": 190.90625, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.050175029171528586, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.040224543772637844, + "learning_rate": 7.8112e-06, + "loss": 0.001, + "num_tokens": 21854292.0, + "reward": 3.567188262939453, + "reward_std": 0.5695129632949829, + "rewards/reward_fn/mean": 3.567188262939453, + "rewards/reward_fn/std": 0.5695129036903381, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 718.0, + "completions/max_terminated_length": 718.0, + "completions/mean_length": 204.84375, + "completions/mean_terminated_length": 204.84375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.05028110745730349, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1201171875, + "kl": 0.029980882070958614, + "learning_rate": 7.8108e-06, + "loss": 0.0012, + "num_tokens": 21899471.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 143.0, + "completions/max_terminated_length": 143.0, + "completions/mean_length": 113.125, + "completions/mean_terminated_length": 113.125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.05038718574307839, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.028396222507581115, + "learning_rate": 7.810399999999999e-06, + "loss": -0.0355, + "num_tokens": 21924563.0, + "reward": 3.928886651992798, + "reward_std": 0.40227818489074707, + "rewards/reward_fn/mean": 3.928886651992798, + "rewards/reward_fn/std": 0.40227818489074707, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.0, + "completions/max_terminated_length": 561.0, + "completions/mean_length": 334.0625, + "completions/mean_terminated_length": 334.0625, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.05049326402885329, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.020257814088836312, + "learning_rate": 7.81e-06, + "loss": 0.0086, + "num_tokens": 21961941.0, + "reward": 2.9945178031921387, + "reward_std": 0.0230459775775671, + "rewards/reward_fn/mean": 2.9945178031921387, + "rewards/reward_fn/std": 0.02304600365459919, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 158.6875, + "completions/mean_terminated_length": 158.6875, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.0505993423146282, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.65625, + "kl": 0.0344227678142488, + "learning_rate": 7.809599999999999e-06, + "loss": -0.0361, + "num_tokens": 22013227.0, + "reward": 3.5052218437194824, + "reward_std": 0.6092379689216614, + "rewards/reward_fn/mean": 3.5052218437194824, + "rewards/reward_fn/std": 0.6092379689216614, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 168.21875, + "completions/mean_terminated_length": 168.21875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.050705420600403096, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.185546875, + "kl": 0.028984917444176972, + "learning_rate": 7.8092e-06, + "loss": 0.0012, + "num_tokens": 22053618.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 497.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 286.9375, + "completions/mean_terminated_length": 286.9375, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.050811498886178, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.022939151618629694, + "learning_rate": 7.808799999999999e-06, + "loss": -0.0067, + "num_tokens": 22097488.0, + "reward": 2.780938148498535, + "reward_std": 0.5519546866416931, + "rewards/reward_fn/mean": 2.780938148498535, + "rewards/reward_fn/std": 0.5519546866416931, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 920.0, + "completions/max_terminated_length": 920.0, + "completions/mean_length": 172.59375, + "completions/mean_terminated_length": 172.59375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.0509175771719529, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.75, + "kl": 0.028172635240480304, + "learning_rate": 7.8084e-06, + "loss": -0.0612, + "num_tokens": 22147619.0, + "reward": 3.802846670150757, + "reward_std": 0.3469943106174469, + "rewards/reward_fn/mean": 3.802846670150757, + "rewards/reward_fn/std": 0.3469943106174469, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.0, + "completions/max_terminated_length": 582.0, + "completions/mean_length": 185.96875, + "completions/mean_terminated_length": 185.96875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.0510236554577278, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "kl": 0.02581855608150363, + "learning_rate": 7.807999999999999e-06, + "loss": -0.016, + "num_tokens": 22184738.0, + "reward": 3.9292778968811035, + "reward_std": 0.27909815311431885, + "rewards/reward_fn/mean": 3.9292778968811035, + "rewards/reward_fn/std": 0.27909815311431885, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1052.0, + "completions/max_terminated_length": 1052.0, + "completions/mean_length": 219.59375, + "completions/mean_terminated_length": 219.59375, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.05112973374350271, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9765625, + "kl": 0.023512960644438863, + "learning_rate": 7.8076e-06, + "loss": 0.0226, + "num_tokens": 22225749.0, + "reward": 3.501314640045166, + "reward_std": 0.5752494931221008, + "rewards/reward_fn/mean": 3.501314640045166, + "rewards/reward_fn/std": 0.575249433517456, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 269.03125, + "completions/mean_terminated_length": 269.03125, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.05123581202927761, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.02642969344742596, + "learning_rate": 7.8072e-06, + "loss": -0.0279, + "num_tokens": 22272534.0, + "reward": 1.7679412364959717, + "reward_std": 0.03526504710316658, + "rewards/reward_fn/mean": 1.7679412364959717, + "rewards/reward_fn/std": 0.03526502102613449, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 243.46875, + "completions/mean_terminated_length": 243.46875, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.051341890315052506, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "kl": 0.018053250038065016, + "learning_rate": 7.8068e-06, + "loss": 0.0794, + "num_tokens": 22319717.0, + "reward": 3.0779318809509277, + "reward_std": 0.24998284876346588, + "rewards/reward_fn/mean": 3.0779318809509277, + "rewards/reward_fn/std": 0.2499828338623047, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1239.0, + "completions/max_terminated_length": 1239.0, + "completions/mean_length": 285.96875, + "completions/mean_terminated_length": 285.96875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.05144796860082741, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.026710799895226955, + "learning_rate": 7.8064e-06, + "loss": -0.127, + "num_tokens": 22362308.0, + "reward": 2.9080183506011963, + "reward_std": 0.29254910349845886, + "rewards/reward_fn/mean": 2.9080183506011963, + "rewards/reward_fn/std": 0.29254910349845886, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 876.0, + "completions/max_terminated_length": 876.0, + "completions/mean_length": 370.875, + "completions/mean_terminated_length": 370.875, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.05155404688660231, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.0195961135905236, + "learning_rate": 7.806e-06, + "loss": -0.0933, + "num_tokens": 22407808.0, + "reward": 3.541620969772339, + "reward_std": 0.730574369430542, + "rewards/reward_fn/mean": 3.541620969772339, + "rewards/reward_fn/std": 0.7305744290351868, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 203.96875, + "completions/mean_terminated_length": 203.96875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.05166012517237722, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11865234375, + "kl": 0.02821559482254088, + "learning_rate": 7.8056e-06, + "loss": 0.0011, + "num_tokens": 22448063.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1064.0, + "completions/mean_length": 575.125, + "completions/mean_terminated_length": 527.6129150390625, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.05176620345815212, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.014755380223505199, + "learning_rate": 7.8052e-06, + "loss": 0.201, + "num_tokens": 22517571.0, + "reward": 3.101287841796875, + "reward_std": 0.8764511942863464, + "rewards/reward_fn/mean": 3.101287841796875, + "rewards/reward_fn/std": 0.8764511942863464, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1432.0, + "completions/max_terminated_length": 1432.0, + "completions/mean_length": 599.1875, + "completions/mean_terminated_length": 599.1875, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.051872281743927016, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.022309450898319483, + "learning_rate": 7.8048e-06, + "loss": 0.0696, + "num_tokens": 22571081.0, + "reward": 2.6842198371887207, + "reward_std": 0.4956829249858856, + "rewards/reward_fn/mean": 2.6842198371887207, + "rewards/reward_fn/std": 0.49568289518356323, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1693.0, + "completions/max_terminated_length": 1693.0, + "completions/mean_length": 440.71875, + "completions/mean_terminated_length": 440.71875, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.05197836002970192, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.011530151590704918, + "learning_rate": 7.8044e-06, + "loss": -0.089, + "num_tokens": 22635232.0, + "reward": 2.738710641860962, + "reward_std": 0.24050787091255188, + "rewards/reward_fn/mean": 2.738710641860962, + "rewards/reward_fn/std": 0.24050785601139069, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1725.0, + "completions/max_terminated_length": 1725.0, + "completions/mean_length": 503.53125, + "completions/mean_terminated_length": 503.53125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.05208443831547682, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.023490125546231866, + "learning_rate": 7.804e-06, + "loss": 0.0417, + "num_tokens": 22692849.0, + "reward": 3.4488565921783447, + "reward_std": 0.6377933025360107, + "rewards/reward_fn/mean": 3.4488565921783447, + "rewards/reward_fn/std": 0.6377933025360107, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1144.0, + "completions/max_terminated_length": 1144.0, + "completions/mean_length": 233.1875, + "completions/mean_terminated_length": 233.1875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.05219051660125172, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.023990701185539365, + "learning_rate": 7.8036e-06, + "loss": -0.0747, + "num_tokens": 22734231.0, + "reward": 2.9281232357025146, + "reward_std": 0.19461673498153687, + "rewards/reward_fn/mean": 2.9281232357025146, + "rewards/reward_fn/std": 0.19461672008037567, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 941.0, + "completions/max_terminated_length": 941.0, + "completions/mean_length": 264.28125, + "completions/mean_terminated_length": 264.28125, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.05229659488702663, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.026221094885841012, + "learning_rate": 7.8032e-06, + "loss": 0.1836, + "num_tokens": 22778592.0, + "reward": 3.4923720359802246, + "reward_std": 0.5916131734848022, + "rewards/reward_fn/mean": 3.4923720359802246, + "rewards/reward_fn/std": 0.5916131734848022, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.0, + "completions/max_terminated_length": 559.0, + "completions/mean_length": 197.25, + "completions/mean_terminated_length": 197.25, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.05240267317280153, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1298828125, + "kl": 0.024377966998144984, + "learning_rate": 7.8028e-06, + "loss": 0.001, + "num_tokens": 22821800.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1664.0, + "completions/max_terminated_length": 1664.0, + "completions/mean_length": 389.84375, + "completions/mean_terminated_length": 389.84375, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.052508751458576426, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.03522437275387347, + "learning_rate": 7.8024e-06, + "loss": 0.0624, + "num_tokens": 22870019.0, + "reward": 3.2021644115448, + "reward_std": 0.43235763907432556, + "rewards/reward_fn/mean": 3.2021644115448, + "rewards/reward_fn/std": 0.4323575794696808, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 676.0, + "completions/max_terminated_length": 676.0, + "completions/mean_length": 292.8125, + "completions/mean_terminated_length": 292.8125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.05261482974435133, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "kl": 0.01953095616772771, + "learning_rate": 7.802e-06, + "loss": 0.1153, + "num_tokens": 22925949.0, + "reward": 3.7166550159454346, + "reward_std": 0.6504583358764648, + "rewards/reward_fn/mean": 3.7166550159454346, + "rewards/reward_fn/std": 0.6504583358764648, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1207.0, + "completions/max_terminated_length": 1207.0, + "completions/mean_length": 581.03125, + "completions/mean_terminated_length": 581.03125, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.05272090803012623, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.015155259286984801, + "learning_rate": 7.8016e-06, + "loss": -0.0057, + "num_tokens": 22988254.0, + "reward": 2.6650097370147705, + "reward_std": 0.3541271388530731, + "rewards/reward_fn/mean": 2.6650097370147705, + "rewards/reward_fn/std": 0.3541271388530731, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1439.0, + "completions/max_terminated_length": 1439.0, + "completions/mean_length": 470.71875, + "completions/mean_terminated_length": 470.71875, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.05282698631590114, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.017473606974817812, + "learning_rate": 7.801199999999999e-06, + "loss": -0.0481, + "num_tokens": 23038293.0, + "reward": 2.4420838356018066, + "reward_std": 0.5715554356575012, + "rewards/reward_fn/mean": 2.4420838356018066, + "rewards/reward_fn/std": 0.5715554356575012, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 959.0, + "completions/max_terminated_length": 959.0, + "completions/mean_length": 338.46875, + "completions/mean_terminated_length": 338.46875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.05293306460167604, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.062255859375, + "kl": 0.015421941527165473, + "learning_rate": 7.8008e-06, + "loss": 0.0006, + "num_tokens": 23075140.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 269.1875, + "completions/mean_terminated_length": 269.1875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.05303914288745094, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.01675526413600892, + "learning_rate": 7.800399999999999e-06, + "loss": 0.037, + "num_tokens": 23126026.0, + "reward": 3.928408145904541, + "reward_std": 0.4049839973449707, + "rewards/reward_fn/mean": 3.928408145904541, + "rewards/reward_fn/std": 0.4049839973449707, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 93.0, + "completions/max_terminated_length": 93.0, + "completions/mean_length": 66.9375, + "completions/mean_terminated_length": 66.9375, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.05314522117322584, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.279296875, + "kl": 0.026731195161119103, + "learning_rate": 7.8e-06, + "loss": 0.0011, + "num_tokens": 23146344.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1739.0, + "completions/max_terminated_length": 1739.0, + "completions/mean_length": 440.5625, + "completions/mean_terminated_length": 440.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.05325129945900074, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.018118804087862372, + "learning_rate": 7.799599999999999e-06, + "loss": 0.0598, + "num_tokens": 23201882.0, + "reward": 3.886624336242676, + "reward_std": 0.4708651900291443, + "rewards/reward_fn/mean": 3.886624336242676, + "rewards/reward_fn/std": 0.4708651900291443, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1298.0, + "completions/max_terminated_length": 1298.0, + "completions/mean_length": 301.1875, + "completions/mean_terminated_length": 301.1875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.05335737774477564, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.02376817143522203, + "learning_rate": 7.7992e-06, + "loss": -0.0487, + "num_tokens": 23246368.0, + "reward": 3.6897573471069336, + "reward_std": 0.4686228930950165, + "rewards/reward_fn/mean": 3.6897573471069336, + "rewards/reward_fn/std": 0.4686228930950165, + "step": 503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1320.0, + "completions/max_terminated_length": 1320.0, + "completions/mean_length": 344.96875, + "completions/mean_terminated_length": 344.96875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.05346345603055055, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.025152756366878748, + "learning_rate": 7.798799999999999e-06, + "loss": 0.1383, + "num_tokens": 23291359.0, + "reward": 2.986790657043457, + "reward_std": 0.23401233553886414, + "rewards/reward_fn/mean": 2.986790657043457, + "rewards/reward_fn/std": 0.23401233553886414, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1796.0, + "completions/max_terminated_length": 1796.0, + "completions/mean_length": 484.1875, + "completions/mean_terminated_length": 484.1875, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.05356953431632545, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.017668452579528093, + "learning_rate": 7.7984e-06, + "loss": 0.0279, + "num_tokens": 23355333.0, + "reward": 2.622150421142578, + "reward_std": 0.31195494532585144, + "rewards/reward_fn/mean": 2.622150421142578, + "rewards/reward_fn/std": 0.31195491552352905, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1184.0, + "completions/max_terminated_length": 1184.0, + "completions/mean_length": 353.625, + "completions/mean_terminated_length": 353.625, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.05367561260210035, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.018400359200313687, + "learning_rate": 7.797999999999999e-06, + "loss": 0.0488, + "num_tokens": 23388185.0, + "reward": 3.707305431365967, + "reward_std": 0.7213976383209229, + "rewards/reward_fn/mean": 3.707305431365967, + "rewards/reward_fn/std": 0.7213976979255676, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 230.78125, + "completions/mean_terminated_length": 230.78125, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.05378169088787525, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.02609437541104853, + "learning_rate": 7.7976e-06, + "loss": -0.0298, + "num_tokens": 23435730.0, + "reward": 3.6243045330047607, + "reward_std": 0.5281786322593689, + "rewards/reward_fn/mean": 3.6243045330047607, + "rewards/reward_fn/std": 0.5281786322593689, + "step": 507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1025.0, + "completions/mean_length": 315.96875, + "completions/mean_terminated_length": 260.0967712402344, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.05388776917365015, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.02170586190186441, + "learning_rate": 7.7972e-06, + "loss": 0.3208, + "num_tokens": 23483857.0, + "reward": 3.8347511291503906, + "reward_std": 0.7358342409133911, + "rewards/reward_fn/mean": 3.8347511291503906, + "rewards/reward_fn/std": 0.7358343005180359, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 293.3125, + "completions/mean_terminated_length": 293.3125, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.05399384745942506, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.0171760261291638, + "learning_rate": 7.7968e-06, + "loss": -0.0252, + "num_tokens": 23537883.0, + "reward": 3.928311824798584, + "reward_std": 0.4055293798446655, + "rewards/reward_fn/mean": 3.928311824798584, + "rewards/reward_fn/std": 0.4055293798446655, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 194.53125, + "completions/mean_terminated_length": 194.53125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.05409992574519996, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.099609375, + "kl": 0.025215985951945186, + "learning_rate": 7.7964e-06, + "loss": 0.001, + "num_tokens": 23567724.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1246.0, + "completions/max_terminated_length": 1246.0, + "completions/mean_length": 224.09375, + "completions/mean_terminated_length": 224.09375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.05420600403097486, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.025873176753520966, + "learning_rate": 7.796e-06, + "loss": -0.0319, + "num_tokens": 23617359.0, + "reward": 2.72564435005188, + "reward_std": 0.18606720864772797, + "rewards/reward_fn/mean": 2.72564435005188, + "rewards/reward_fn/std": 0.18606719374656677, + "step": 511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 221.46875, + "completions/mean_terminated_length": 221.46875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.05431208231674976, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09814453125, + "kl": 0.02119889738969505, + "learning_rate": 7.7956e-06, + "loss": 0.0008, + "num_tokens": 23674782.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 788.0, + "completions/max_terminated_length": 788.0, + "completions/mean_length": 275.71875, + "completions/mean_terminated_length": 275.71875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.05441816060252466, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.019600946456193924, + "learning_rate": 7.7952e-06, + "loss": 0.0119, + "num_tokens": 23716373.0, + "reward": 3.9173965454101562, + "reward_std": 0.3250616788864136, + "rewards/reward_fn/mean": 3.9173965454101562, + "rewards/reward_fn/std": 0.3250616490840912, + "step": 513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 199.875, + "completions/mean_terminated_length": 199.875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.05452423888829957, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0947265625, + "kl": 0.023369343602098525, + "learning_rate": 7.7948e-06, + "loss": 0.0009, + "num_tokens": 23751569.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 144.625, + "completions/mean_terminated_length": 144.625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.05463031717407447, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11083984375, + "kl": 0.024780564941465855, + "learning_rate": 7.7944e-06, + "loss": 0.001, + "num_tokens": 23795365.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1589.0, + "completions/max_terminated_length": 1589.0, + "completions/mean_length": 482.34375, + "completions/mean_terminated_length": 482.34375, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.05473639545984937, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.01738837524317205, + "learning_rate": 7.793999999999999e-06, + "loss": 0.0892, + "num_tokens": 23869648.0, + "reward": 3.2161927223205566, + "reward_std": 0.9148228764533997, + "rewards/reward_fn/mean": 3.2161927223205566, + "rewards/reward_fn/std": 0.9148228764533997, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1034.0, + "completions/max_terminated_length": 1034.0, + "completions/mean_length": 371.25, + "completions/mean_terminated_length": 371.25, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.05484247374562427, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.02530662529170513, + "learning_rate": 7.7936e-06, + "loss": -0.0051, + "num_tokens": 23911960.0, + "reward": 2.4564008712768555, + "reward_std": 0.7786185145378113, + "rewards/reward_fn/mean": 2.4564008712768555, + "rewards/reward_fn/std": 0.7786185145378113, + "step": 517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 99.0625, + "completions/mean_terminated_length": 99.0625, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.05494855203139917, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.203125, + "kl": 0.025294956751167774, + "learning_rate": 7.793199999999999e-06, + "loss": 0.001, + "num_tokens": 23950170.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 156.125, + "completions/mean_terminated_length": 156.125, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.05505463031717407, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.01991723431274295, + "learning_rate": 7.7928e-06, + "loss": 0.0355, + "num_tokens": 23986654.0, + "reward": 3.9727840423583984, + "reward_std": 0.15395739674568176, + "rewards/reward_fn/mean": 3.9727840423583984, + "rewards/reward_fn/std": 0.15395741164684296, + "step": 519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 80.59375, + "completions/mean_terminated_length": 80.59375, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.05516070860294898, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2041015625, + "kl": 0.02751685946714133, + "learning_rate": 7.7924e-06, + "loss": 0.0011, + "num_tokens": 24022641.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 796.0, + "completions/max_terminated_length": 796.0, + "completions/mean_length": 288.15625, + "completions/mean_terminated_length": 288.15625, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.05526678688872388, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07421875, + "kl": 0.01649426226504147, + "learning_rate": 7.792e-06, + "loss": 0.0007, + "num_tokens": 24068726.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 774.0, + "completions/max_terminated_length": 774.0, + "completions/mean_length": 201.03125, + "completions/mean_terminated_length": 201.03125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.05537286517449878, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "kl": 0.024215756682679057, + "learning_rate": 7.7916e-06, + "loss": -0.0523, + "num_tokens": 24125495.0, + "reward": 3.6382832527160645, + "reward_std": 0.476779580116272, + "rewards/reward_fn/mean": 3.6382832527160645, + "rewards/reward_fn/std": 0.476779580116272, + "step": 522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1266.0, + "completions/max_terminated_length": 1266.0, + "completions/mean_length": 372.78125, + "completions/mean_terminated_length": 372.78125, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.05547894346027368, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.018172105425037444, + "learning_rate": 7.7912e-06, + "loss": 0.0154, + "num_tokens": 24186032.0, + "reward": 2.670097589492798, + "reward_std": 0.18027065694332123, + "rewards/reward_fn/mean": 2.670097589492798, + "rewards/reward_fn/std": 0.18027064204216003, + "step": 523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 795.0, + "completions/max_terminated_length": 795.0, + "completions/mean_length": 239.28125, + "completions/mean_terminated_length": 239.28125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.05558502174604858, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "kl": 0.024925056844949722, + "learning_rate": 7.7908e-06, + "loss": 0.0018, + "num_tokens": 24209209.0, + "reward": 3.8195762634277344, + "reward_std": 0.5922850966453552, + "rewards/reward_fn/mean": 3.8195762634277344, + "rewards/reward_fn/std": 0.5922850966453552, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 770.0, + "completions/max_terminated_length": 770.0, + "completions/mean_length": 222.53125, + "completions/mean_terminated_length": 222.53125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.05569110003182349, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.025476337876170874, + "learning_rate": 7.790399999999999e-06, + "loss": -0.0391, + "num_tokens": 24253034.0, + "reward": 3.9297561645507812, + "reward_std": 0.2772708833217621, + "rewards/reward_fn/mean": 3.9297561645507812, + "rewards/reward_fn/std": 0.2772708535194397, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 267.0625, + "completions/mean_terminated_length": 267.0625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.05579717831759839, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.021522009512409568, + "learning_rate": 7.79e-06, + "loss": -0.0187, + "num_tokens": 24294476.0, + "reward": 2.7246017456054688, + "reward_std": 0.24783332645893097, + "rewards/reward_fn/mean": 2.7246017456054688, + "rewards/reward_fn/std": 0.24783335626125336, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1145.0, + "completions/max_terminated_length": 1145.0, + "completions/mean_length": 333.125, + "completions/mean_terminated_length": 333.125, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.05590325660337329, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.023421776480972767, + "learning_rate": 7.789599999999999e-06, + "loss": 0.1981, + "num_tokens": 24369616.0, + "reward": 3.8314743041992188, + "reward_std": 0.5596181154251099, + "rewards/reward_fn/mean": 3.8314743041992188, + "rewards/reward_fn/std": 0.5596181154251099, + "step": 527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 910.0, + "completions/max_terminated_length": 910.0, + "completions/mean_length": 336.9375, + "completions/mean_terminated_length": 336.9375, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.056009334889148193, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.018540258635766804, + "learning_rate": 7.7892e-06, + "loss": -0.0174, + "num_tokens": 24426990.0, + "reward": 3.9209604263305664, + "reward_std": 0.3111303448677063, + "rewards/reward_fn/mean": 3.9209604263305664, + "rewards/reward_fn/std": 0.3111303448677063, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.0, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 424.53125, + "completions/mean_terminated_length": 424.53125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.05611541317492309, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.02311969199217856, + "learning_rate": 7.788799999999999e-06, + "loss": 0.0972, + "num_tokens": 24479199.0, + "reward": 2.616727828979492, + "reward_std": 0.3706587255001068, + "rewards/reward_fn/mean": 2.616727828979492, + "rewards/reward_fn/std": 0.3706587553024292, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1286.0, + "completions/max_terminated_length": 1286.0, + "completions/mean_length": 363.96875, + "completions/mean_terminated_length": 363.96875, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.05622149146069799, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.018128991941921413, + "learning_rate": 7.7884e-06, + "loss": -0.0755, + "num_tokens": 24525630.0, + "reward": 2.7659575939178467, + "reward_std": 0.31296306848526, + "rewards/reward_fn/mean": 2.7659575939178467, + "rewards/reward_fn/std": 0.3129630386829376, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 799.0, + "completions/max_terminated_length": 799.0, + "completions/mean_length": 256.28125, + "completions/mean_terminated_length": 256.28125, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.0563275697464729, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.076171875, + "kl": 0.02169134363066405, + "learning_rate": 7.788e-06, + "loss": 0.0009, + "num_tokens": 24582535.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1121.0, + "completions/max_terminated_length": 1121.0, + "completions/mean_length": 280.75, + "completions/mean_terminated_length": 280.75, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.0564336480322478, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.022156615275889635, + "learning_rate": 7.7876e-06, + "loss": -0.0788, + "num_tokens": 24625151.0, + "reward": 3.9418551921844482, + "reward_std": 0.22879938781261444, + "rewards/reward_fn/mean": 3.9418551921844482, + "rewards/reward_fn/std": 0.22879941761493683, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 192.71875, + "completions/mean_terminated_length": 192.71875, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.056539726318022704, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.01747878734022379, + "learning_rate": 7.7872e-06, + "loss": -0.0088, + "num_tokens": 24663670.0, + "reward": 2.927187442779541, + "reward_std": 0.05285609886050224, + "rewards/reward_fn/mean": 2.927187442779541, + "rewards/reward_fn/std": 0.05285611376166344, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1364.0, + "completions/max_terminated_length": 1364.0, + "completions/mean_length": 286.15625, + "completions/mean_terminated_length": 286.15625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.0566458046037976, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.021938928868621588, + "learning_rate": 7.7868e-06, + "loss": -0.0154, + "num_tokens": 24693819.0, + "reward": 2.5903778076171875, + "reward_std": 0.4148198962211609, + "rewards/reward_fn/mean": 2.5903778076171875, + "rewards/reward_fn/std": 0.4148198962211609, + "step": 534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 176.0, + "completions/mean_terminated_length": 176.0, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.0567518828895725, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.026313411304727197, + "learning_rate": 7.7864e-06, + "loss": -0.043, + "num_tokens": 24727163.0, + "reward": 3.9305410385131836, + "reward_std": 0.3929198384284973, + "rewards/reward_fn/mean": 3.9305410385131836, + "rewards/reward_fn/std": 0.3929198086261749, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 989.0, + "completions/mean_length": 450.96875, + "completions/mean_terminated_length": 399.45159912109375, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.05685796117534741, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.02474389597773552, + "learning_rate": 7.786e-06, + "loss": 0.295, + "num_tokens": 24807642.0, + "reward": 2.4844565391540527, + "reward_std": 0.7617939710617065, + "rewards/reward_fn/mean": 2.4844565391540527, + "rewards/reward_fn/std": 0.7617940306663513, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1038.0, + "completions/max_terminated_length": 1038.0, + "completions/mean_length": 279.125, + "completions/mean_terminated_length": 279.125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.05696403946112231, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.023733343929052353, + "learning_rate": 7.785599999999999e-06, + "loss": 0.2354, + "num_tokens": 24864926.0, + "reward": 3.690593719482422, + "reward_std": 0.6516405344009399, + "rewards/reward_fn/mean": 3.690593719482422, + "rewards/reward_fn/std": 0.6516405344009399, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1344.0, + "completions/max_terminated_length": 1344.0, + "completions/mean_length": 249.25, + "completions/mean_terminated_length": 249.25, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.05707011774689721, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0791015625, + "kl": 0.01965078490320593, + "learning_rate": 7.7852e-06, + "loss": 0.0008, + "num_tokens": 24893702.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 251.375, + "completions/mean_terminated_length": 251.375, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.057176196032672114, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.0240073436871171, + "learning_rate": 7.784799999999999e-06, + "loss": -0.0035, + "num_tokens": 24948466.0, + "reward": 3.0717613697052, + "reward_std": 0.3055979013442993, + "rewards/reward_fn/mean": 3.0717613697052, + "rewards/reward_fn/std": 0.3055979013442993, + "step": 539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 922.0, + "completions/max_terminated_length": 922.0, + "completions/mean_length": 420.9375, + "completions/mean_terminated_length": 420.9375, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.05728227431844701, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.017085597850382328, + "learning_rate": 7.7844e-06, + "loss": -0.0404, + "num_tokens": 24999024.0, + "reward": 3.8227782249450684, + "reward_std": 0.5049712657928467, + "rewards/reward_fn/mean": 3.8227782249450684, + "rewards/reward_fn/std": 0.5049712061882019, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 820.0, + "completions/mean_length": 429.375, + "completions/mean_terminated_length": 377.1612854003906, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.05738835260422192, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.017597553320229053, + "learning_rate": 7.783999999999999e-06, + "loss": 0.2524, + "num_tokens": 25047324.0, + "reward": 3.8369994163513184, + "reward_std": 0.7323954701423645, + "rewards/reward_fn/mean": 3.8369994163513184, + "rewards/reward_fn/std": 0.7323954105377197, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1177.0, + "completions/max_terminated_length": 1177.0, + "completions/mean_length": 378.90625, + "completions/mean_terminated_length": 378.90625, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.05749443088999682, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.016290104715153575, + "learning_rate": 7.7836e-06, + "loss": 0.0425, + "num_tokens": 25098361.0, + "reward": 3.7207727432250977, + "reward_std": 0.5368377566337585, + "rewards/reward_fn/mean": 3.7207727432250977, + "rewards/reward_fn/std": 0.5368378162384033, + "step": 542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1419.0, + "completions/mean_length": 524.0, + "completions/mean_terminated_length": 474.83868408203125, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.05760050917577172, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.015481176087632775, + "learning_rate": 7.7832e-06, + "loss": 0.2563, + "num_tokens": 25159897.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1671.0, + "completions/max_terminated_length": 1671.0, + "completions/mean_length": 397.84375, + "completions/mean_terminated_length": 397.84375, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.057706587461546624, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.020055697415955365, + "learning_rate": 7.7828e-06, + "loss": 0.0396, + "num_tokens": 25186612.0, + "reward": 3.4372525215148926, + "reward_std": 0.9111435413360596, + "rewards/reward_fn/mean": 3.4372525215148926, + "rewards/reward_fn/std": 0.9111434817314148, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1547.0, + "completions/max_terminated_length": 1547.0, + "completions/mean_length": 402.5625, + "completions/mean_terminated_length": 402.5625, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.05781266574732152, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.01703974965494126, + "learning_rate": 7.7824e-06, + "loss": -0.0174, + "num_tokens": 25245926.0, + "reward": 2.1865077018737793, + "reward_std": 0.591599702835083, + "rewards/reward_fn/mean": 2.1865077018737793, + "rewards/reward_fn/std": 0.5915996432304382, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 129.03125, + "completions/mean_terminated_length": 129.03125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.05791874403309642, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13671875, + "kl": 0.021547950338572264, + "learning_rate": 7.782e-06, + "loss": 0.0009, + "num_tokens": 25293607.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 571.0, + "completions/max_terminated_length": 571.0, + "completions/mean_length": 135.34375, + "completions/mean_terminated_length": 135.34375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.05802482231887133, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11181640625, + "kl": 0.02250622259452939, + "learning_rate": 7.7816e-06, + "loss": 0.0009, + "num_tokens": 25337234.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 911.0, + "completions/max_terminated_length": 911.0, + "completions/mean_length": 438.40625, + "completions/mean_terminated_length": 438.40625, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.05813090060464623, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.016708286479115486, + "learning_rate": 7.7812e-06, + "loss": -0.0201, + "num_tokens": 25370719.0, + "reward": 3.6040561199188232, + "reward_std": 0.6672210693359375, + "rewards/reward_fn/mean": 3.6040561199188232, + "rewards/reward_fn/std": 0.6672210693359375, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 716.0, + "completions/max_terminated_length": 716.0, + "completions/mean_length": 275.15625, + "completions/mean_terminated_length": 275.15625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.05823697889042113, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.022039557108655572, + "learning_rate": 7.7808e-06, + "loss": -0.0129, + "num_tokens": 25399588.0, + "reward": 3.9713706970214844, + "reward_std": 0.1619519293308258, + "rewards/reward_fn/mean": 3.9713706970214844, + "rewards/reward_fn/std": 0.1619519144296646, + "step": 549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 251.59375, + "completions/mean_terminated_length": 251.59375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.058343057176196034, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.025676261633634567, + "learning_rate": 7.7804e-06, + "loss": 0.0033, + "num_tokens": 25442935.0, + "reward": 3.9251885414123535, + "reward_std": 0.42319679260253906, + "rewards/reward_fn/mean": 3.9251885414123535, + "rewards/reward_fn/std": 0.4231967329978943, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 318.1875, + "completions/mean_terminated_length": 318.1875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.05844913546197093, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.01699168875347823, + "learning_rate": 7.78e-06, + "loss": 0.0007, + "num_tokens": 25508861.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 216.03125, + "completions/mean_terminated_length": 216.03125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.05855521374774584, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.023042756598442793, + "learning_rate": 7.7796e-06, + "loss": -0.101, + "num_tokens": 25546814.0, + "reward": 2.9139137268066406, + "reward_std": 0.2089153379201889, + "rewards/reward_fn/mean": 2.9139137268066406, + "rewards/reward_fn/std": 0.2089153528213501, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 324.71875, + "completions/mean_terminated_length": 324.71875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.05866129203352074, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.021038109436631203, + "learning_rate": 7.7792e-06, + "loss": 0.0099, + "num_tokens": 25596405.0, + "reward": 2.8648502826690674, + "reward_std": 0.312248170375824, + "rewards/reward_fn/mean": 2.8648502826690674, + "rewards/reward_fn/std": 0.312248170375824, + "step": 553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1097.0, + "completions/mean_length": 406.96875, + "completions/mean_terminated_length": 354.0322570800781, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.05876737031929564, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.027637179009616375, + "learning_rate": 7.7788e-06, + "loss": 0.0696, + "num_tokens": 25645524.0, + "reward": 3.4637341499328613, + "reward_std": 0.9320629239082336, + "rewards/reward_fn/mean": 3.4637341499328613, + "rewards/reward_fn/std": 0.9320629239082336, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1251.0, + "completions/max_terminated_length": 1251.0, + "completions/mean_length": 337.375, + "completions/mean_terminated_length": 337.375, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.058873448605070544, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.02197849005460739, + "learning_rate": 7.7784e-06, + "loss": -0.0888, + "num_tokens": 25689728.0, + "reward": 3.8957581520080566, + "reward_std": 0.3294965326786041, + "rewards/reward_fn/mean": 3.8957581520080566, + "rewards/reward_fn/std": 0.3294965624809265, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 770.0, + "completions/max_terminated_length": 770.0, + "completions/mean_length": 230.8125, + "completions/mean_terminated_length": 230.8125, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.05897952689084544, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.029500858625397086, + "learning_rate": 7.777999999999999e-06, + "loss": 0.0233, + "num_tokens": 25739002.0, + "reward": 2.866637706756592, + "reward_std": 0.37486761808395386, + "rewards/reward_fn/mean": 2.866637706756592, + "rewards/reward_fn/std": 0.37486761808395386, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 974.0, + "completions/max_terminated_length": 974.0, + "completions/mean_length": 222.09375, + "completions/mean_terminated_length": 222.09375, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.05908560517662034, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.02057086571585387, + "learning_rate": 7.7776e-06, + "loss": 0.0235, + "num_tokens": 25777245.0, + "reward": 2.855576515197754, + "reward_std": 0.3794987201690674, + "rewards/reward_fn/mean": 2.855576515197754, + "rewards/reward_fn/std": 0.37949874997138977, + "step": 557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 306.65625, + "completions/mean_terminated_length": 306.65625, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.05919168346239525, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.02790513075888157, + "learning_rate": 7.777199999999999e-06, + "loss": 0.0267, + "num_tokens": 25825586.0, + "reward": 3.3989100456237793, + "reward_std": 0.577396810054779, + "rewards/reward_fn/mean": 3.3989100456237793, + "rewards/reward_fn/std": 0.5773967504501343, + "step": 558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 869.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 306.375, + "completions/mean_terminated_length": 306.375, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.05929776174817015, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.02001808863133192, + "learning_rate": 7.7768e-06, + "loss": 0.0595, + "num_tokens": 25870590.0, + "reward": 3.9636728763580322, + "reward_std": 0.2054968923330307, + "rewards/reward_fn/mean": 3.9636728763580322, + "rewards/reward_fn/std": 0.2054968625307083, + "step": 559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 199.78125, + "completions/mean_terminated_length": 199.78125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.059403840033945055, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08154296875, + "kl": 0.022585346596315503, + "learning_rate": 7.776399999999999e-06, + "loss": 0.0009, + "num_tokens": 25911831.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1035.0, + "completions/max_terminated_length": 1035.0, + "completions/mean_length": 234.25, + "completions/mean_terminated_length": 234.25, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.059509918319719954, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1220703125, + "kl": 0.029487166553735733, + "learning_rate": 7.776e-06, + "loss": 0.0012, + "num_tokens": 25958847.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1035.0, + "completions/max_terminated_length": 1035.0, + "completions/mean_length": 212.40625, + "completions/mean_terminated_length": 212.40625, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.05961599660549485, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10791015625, + "kl": 0.028197331819683313, + "learning_rate": 7.775599999999999e-06, + "loss": 0.0011, + "num_tokens": 26001964.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1773.0, + "completions/max_terminated_length": 1773.0, + "completions/mean_length": 312.15625, + "completions/mean_terminated_length": 312.15625, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.05972207489126976, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.030213134363293648, + "learning_rate": 7.7752e-06, + "loss": -0.1456, + "num_tokens": 26043633.0, + "reward": 3.2193734645843506, + "reward_std": 0.7501698732376099, + "rewards/reward_fn/mean": 3.2193734645843506, + "rewards/reward_fn/std": 0.7501698136329651, + "step": 563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 234.03125, + "completions/mean_terminated_length": 234.03125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.05982815317704466, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.890625, + "kl": 0.0262491125613451, + "learning_rate": 7.774799999999999e-06, + "loss": -0.0273, + "num_tokens": 26075090.0, + "reward": 3.796114921569824, + "reward_std": 0.5917590260505676, + "rewards/reward_fn/mean": 3.796114921569824, + "rewards/reward_fn/std": 0.5917590260505676, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 745.0, + "completions/max_terminated_length": 745.0, + "completions/mean_length": 260.59375, + "completions/mean_terminated_length": 260.59375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.05993423146281956, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.02379171922802925, + "learning_rate": 7.7744e-06, + "loss": 0.0414, + "num_tokens": 26121669.0, + "reward": 3.966832160949707, + "reward_std": 0.18762588500976562, + "rewards/reward_fn/mean": 3.966832160949707, + "rewards/reward_fn/std": 0.18762588500976562, + "step": 565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1673.0, + "completions/max_terminated_length": 1673.0, + "completions/mean_length": 340.25, + "completions/mean_terminated_length": 340.25, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.060040309748594464, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.023976900847628713, + "learning_rate": 7.774e-06, + "loss": 0.1362, + "num_tokens": 26176941.0, + "reward": 3.870932102203369, + "reward_std": 0.3512914180755615, + "rewards/reward_fn/mean": 3.870932102203369, + "rewards/reward_fn/std": 0.3512914478778839, + "step": 566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1298.0, + "completions/mean_length": 401.3125, + "completions/mean_terminated_length": 348.19354248046875, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.060146388034369364, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.02290424262173474, + "learning_rate": 7.7736e-06, + "loss": 0.3008, + "num_tokens": 26230135.0, + "reward": 2.7015156745910645, + "reward_std": 0.5482509732246399, + "rewards/reward_fn/mean": 2.7015156745910645, + "rewards/reward_fn/std": 0.5482509732246399, + "step": 567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1626.0, + "completions/max_terminated_length": 1626.0, + "completions/mean_length": 386.40625, + "completions/mean_terminated_length": 386.40625, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.06025246632014427, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.022990726167336106, + "learning_rate": 7.7732e-06, + "loss": 0.0198, + "num_tokens": 26281284.0, + "reward": 3.185511589050293, + "reward_std": 0.48132088780403137, + "rewards/reward_fn/mean": 3.185511589050293, + "rewards/reward_fn/std": 0.4813208281993866, + "step": 568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 771.0, + "completions/max_terminated_length": 771.0, + "completions/mean_length": 221.375, + "completions/mean_terminated_length": 221.375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.06035854460591917, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.359375, + "kl": 0.02692016214132309, + "learning_rate": 7.7728e-06, + "loss": -0.0111, + "num_tokens": 26331024.0, + "reward": 3.2124645709991455, + "reward_std": 1.0723460912704468, + "rewards/reward_fn/mean": 3.2124645709991455, + "rewards/reward_fn/std": 1.0723460912704468, + "step": 569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 291.9375, + "completions/mean_terminated_length": 291.9375, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.06046462289169407, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08349609375, + "kl": 0.016868799226358533, + "learning_rate": 7.7724e-06, + "loss": 0.0007, + "num_tokens": 26383566.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1654.0, + "completions/max_terminated_length": 1654.0, + "completions/mean_length": 541.6875, + "completions/mean_terminated_length": 541.6875, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.060570701177468975, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.021761097013950348, + "learning_rate": 7.772e-06, + "loss": 0.0355, + "num_tokens": 26440228.0, + "reward": 2.8173985481262207, + "reward_std": 0.04130704700946808, + "rewards/reward_fn/mean": 2.8173985481262207, + "rewards/reward_fn/std": 0.04130704328417778, + "step": 571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1903.0, + "completions/mean_length": 743.0625, + "completions/mean_terminated_length": 656.0667114257812, + "completions/min_length": 366.0, + "completions/min_terminated_length": 366.0, + "epoch": 0.060676779463243874, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.02275507105514407, + "learning_rate": 7.7716e-06, + "loss": 0.2517, + "num_tokens": 26500774.0, + "reward": 2.5745859146118164, + "reward_std": 0.7353212237358093, + "rewards/reward_fn/mean": 2.5745859146118164, + "rewards/reward_fn/std": 0.7353212237358093, + "step": 572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 731.0, + "completions/max_terminated_length": 731.0, + "completions/mean_length": 131.0, + "completions/mean_terminated_length": 131.0, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.06078285774901877, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1826171875, + "kl": 0.0287727911490947, + "learning_rate": 7.7712e-06, + "loss": 0.0012, + "num_tokens": 26539238.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 804.0, + "completions/max_terminated_length": 804.0, + "completions/mean_length": 416.3125, + "completions/mean_terminated_length": 416.3125, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.06088893603479368, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.01945284497924149, + "learning_rate": 7.7708e-06, + "loss": 0.0562, + "num_tokens": 26596496.0, + "reward": 2.86415958404541, + "reward_std": 0.027712536975741386, + "rewards/reward_fn/mean": 2.86415958404541, + "rewards/reward_fn/std": 0.027712490409612656, + "step": 574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 824.0, + "completions/max_terminated_length": 824.0, + "completions/mean_length": 234.21875, + "completions/mean_terminated_length": 234.21875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.06099501432056858, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09765625, + "kl": 0.02484210953116417, + "learning_rate": 7.7704e-06, + "loss": 0.001, + "num_tokens": 26638839.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 204.0, + "completions/max_terminated_length": 204.0, + "completions/mean_length": 148.0625, + "completions/mean_terminated_length": 148.0625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.06110109260634348, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1064453125, + "kl": 0.02255557058379054, + "learning_rate": 7.769999999999998e-06, + "loss": 0.0009, + "num_tokens": 26686265.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1047.0, + "completions/max_terminated_length": 1047.0, + "completions/mean_length": 245.84375, + "completions/mean_terminated_length": 245.84375, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.061207170892118384, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.02338914154097438, + "learning_rate": 7.7696e-06, + "loss": 0.0526, + "num_tokens": 26739572.0, + "reward": 3.0910239219665527, + "reward_std": 0.6024491190910339, + "rewards/reward_fn/mean": 3.0910239219665527, + "rewards/reward_fn/std": 0.6024490594863892, + "step": 577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 850.0, + "completions/max_terminated_length": 850.0, + "completions/mean_length": 286.59375, + "completions/mean_terminated_length": 286.59375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.061313249177893284, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.024446885101497173, + "learning_rate": 7.7692e-06, + "loss": 0.0939, + "num_tokens": 26780743.0, + "reward": 3.1179909706115723, + "reward_std": 0.3886381685733795, + "rewards/reward_fn/mean": 3.1179909706115723, + "rewards/reward_fn/std": 0.38863810896873474, + "step": 578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 625.0, + "completions/max_terminated_length": 625.0, + "completions/mean_length": 163.4375, + "completions/mean_terminated_length": 163.4375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.06141932746366819, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.546875, + "kl": 0.03373258630745113, + "learning_rate": 7.7688e-06, + "loss": 0.0155, + "num_tokens": 26818709.0, + "reward": 3.9367802143096924, + "reward_std": 0.24996694922447205, + "rewards/reward_fn/mean": 3.9367802143096924, + "rewards/reward_fn/std": 0.24996691942214966, + "step": 579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 298.34375, + "completions/mean_terminated_length": 298.34375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.06152540574944309, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.025001312140375376, + "learning_rate": 7.7684e-06, + "loss": 0.0323, + "num_tokens": 26876544.0, + "reward": 2.8969321250915527, + "reward_std": 0.41512611508369446, + "rewards/reward_fn/mean": 2.8969321250915527, + "rewards/reward_fn/std": 0.41512614488601685, + "step": 580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 207.96875, + "completions/mean_terminated_length": 207.96875, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.06163148403521799, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.02601949847303331, + "learning_rate": 7.767999999999999e-06, + "loss": 0.0264, + "num_tokens": 26914719.0, + "reward": 3.91837477684021, + "reward_std": 0.3212246596813202, + "rewards/reward_fn/mean": 3.91837477684021, + "rewards/reward_fn/std": 0.3212246298789978, + "step": 581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 869.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 251.75, + "completions/mean_terminated_length": 251.75, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.061737562320992895, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "kl": 0.02899379818700254, + "learning_rate": 7.7676e-06, + "loss": 0.0021, + "num_tokens": 26951543.0, + "reward": 1.9556413888931274, + "reward_std": 0.4255604147911072, + "rewards/reward_fn/mean": 1.9556413888931274, + "rewards/reward_fn/std": 0.4255603849887848, + "step": 582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 161.90625, + "completions/mean_terminated_length": 161.90625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.061843640606767794, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1357421875, + "kl": 0.028046605177223682, + "learning_rate": 7.767199999999999e-06, + "loss": 0.0011, + "num_tokens": 26986676.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1094.0, + "completions/max_terminated_length": 1094.0, + "completions/mean_length": 329.0625, + "completions/mean_terminated_length": 329.0625, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.06194971889254269, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.02596241678111255, + "learning_rate": 7.7668e-06, + "loss": 0.0319, + "num_tokens": 27038422.0, + "reward": 2.7040181159973145, + "reward_std": 0.4574950635433197, + "rewards/reward_fn/mean": 2.7040181159973145, + "rewards/reward_fn/std": 0.4574950933456421, + "step": 584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1041.0, + "completions/max_terminated_length": 1041.0, + "completions/mean_length": 310.34375, + "completions/mean_terminated_length": 310.34375, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.0620557971783176, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.02197631192393601, + "learning_rate": 7.766399999999999e-06, + "loss": -0.0528, + "num_tokens": 27092321.0, + "reward": 3.9704360961914062, + "reward_std": 0.16723935306072235, + "rewards/reward_fn/mean": 3.9704360961914062, + "rewards/reward_fn/std": 0.16723932325839996, + "step": 585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1077.0, + "completions/max_terminated_length": 1077.0, + "completions/mean_length": 256.3125, + "completions/mean_terminated_length": 256.3125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.0621618754640925, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.025298673193901777, + "learning_rate": 7.766e-06, + "loss": 0.034, + "num_tokens": 27120971.0, + "reward": 3.0783987045288086, + "reward_std": 0.49813732504844666, + "rewards/reward_fn/mean": 3.0783987045288086, + "rewards/reward_fn/std": 0.49813729524612427, + "step": 586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1114.0, + "completions/max_terminated_length": 1114.0, + "completions/mean_length": 472.0625, + "completions/mean_terminated_length": 472.0625, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.062267953749867405, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.022866470273584127, + "learning_rate": 7.765599999999999e-06, + "loss": -0.0309, + "num_tokens": 27166509.0, + "reward": 2.5862374305725098, + "reward_std": 0.24781370162963867, + "rewards/reward_fn/mean": 2.5862374305725098, + "rewards/reward_fn/std": 0.24781371653079987, + "step": 587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1359.0, + "completions/max_terminated_length": 1359.0, + "completions/mean_length": 334.09375, + "completions/mean_terminated_length": 334.09375, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.062374032035642304, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.024537658086046576, + "learning_rate": 7.7652e-06, + "loss": 0.0017, + "num_tokens": 27215664.0, + "reward": 3.96297287940979, + "reward_std": 0.2094566822052002, + "rewards/reward_fn/mean": 3.96297287940979, + "rewards/reward_fn/std": 0.2094566524028778, + "step": 588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 840.0, + "completions/max_terminated_length": 840.0, + "completions/mean_length": 236.375, + "completions/mean_terminated_length": 236.375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.062480110321417204, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.05280748289078474, + "learning_rate": 7.7648e-06, + "loss": 0.0752, + "num_tokens": 27262844.0, + "reward": 3.666501998901367, + "reward_std": 0.5441361665725708, + "rewards/reward_fn/mean": 3.666501998901367, + "rewards/reward_fn/std": 0.544136106967926, + "step": 589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 159.875, + "completions/mean_terminated_length": 159.875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.0625861886071921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12255859375, + "kl": 0.029549932572990656, + "learning_rate": 7.7644e-06, + "loss": 0.0012, + "num_tokens": 27303352.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 570.0, + "completions/max_terminated_length": 570.0, + "completions/mean_length": 242.8125, + "completions/mean_terminated_length": 242.8125, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.06269226689296702, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.026474663987755775, + "learning_rate": 7.764e-06, + "loss": 0.0806, + "num_tokens": 27345042.0, + "reward": 3.7683963775634766, + "reward_std": 0.4177395701408386, + "rewards/reward_fn/mean": 3.7683963775634766, + "rewards/reward_fn/std": 0.417739599943161, + "step": 591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 192.46875, + "completions/mean_terminated_length": 192.46875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.06279834517874192, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11328125, + "kl": 0.02718471735715866, + "learning_rate": 7.7636e-06, + "loss": 0.0011, + "num_tokens": 27389153.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 829.0, + "completions/max_terminated_length": 829.0, + "completions/mean_length": 217.40625, + "completions/mean_terminated_length": 217.40625, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.06290442346451681, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.02357006026431918, + "learning_rate": 7.7632e-06, + "loss": 0.0768, + "num_tokens": 27444366.0, + "reward": 3.6223254203796387, + "reward_std": 0.6039735674858093, + "rewards/reward_fn/mean": 3.6223254203796387, + "rewards/reward_fn/std": 0.6039735674858093, + "step": 593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 224.1875, + "completions/mean_terminated_length": 224.1875, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.06301050175029171, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1640625, + "kl": 0.032485876930877566, + "learning_rate": 7.7628e-06, + "loss": 0.0013, + "num_tokens": 27489652.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1325.0, + "completions/mean_length": 370.84375, + "completions/mean_terminated_length": 316.7419128417969, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.06311658003606661, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.021101591642946005, + "learning_rate": 7.7624e-06, + "loss": 0.3438, + "num_tokens": 27522351.0, + "reward": 3.6290016174316406, + "reward_std": 0.8651793599128723, + "rewards/reward_fn/mean": 3.6290016174316406, + "rewards/reward_fn/std": 0.8651794195175171, + "step": 595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1481.0, + "completions/max_terminated_length": 1481.0, + "completions/mean_length": 461.3125, + "completions/mean_terminated_length": 461.3125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.06322265832184151, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.020340920658782125, + "learning_rate": 7.762e-06, + "loss": 0.1006, + "num_tokens": 27556665.0, + "reward": 2.6203527450561523, + "reward_std": 0.41824400424957275, + "rewards/reward_fn/mean": 2.6203527450561523, + "rewards/reward_fn/std": 0.41824400424957275, + "step": 596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1311.0, + "completions/max_terminated_length": 1311.0, + "completions/mean_length": 362.3125, + "completions/mean_terminated_length": 362.3125, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.06332873660761643, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.031615799525752664, + "learning_rate": 7.761599999999999e-06, + "loss": 0.1534, + "num_tokens": 27603619.0, + "reward": 3.4667482376098633, + "reward_std": 0.7769173979759216, + "rewards/reward_fn/mean": 3.4667482376098633, + "rewards/reward_fn/std": 0.7769173979759216, + "step": 597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1047.0, + "completions/max_terminated_length": 1047.0, + "completions/mean_length": 298.5, + "completions/mean_terminated_length": 298.5, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.06343481489339133, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0595703125, + "kl": 0.01625680283177644, + "learning_rate": 7.7612e-06, + "loss": 0.0007, + "num_tokens": 27658291.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1009.0, + "completions/max_terminated_length": 1009.0, + "completions/mean_length": 368.125, + "completions/mean_terminated_length": 368.125, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.06354089317916622, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.022277246927842498, + "learning_rate": 7.760799999999999e-06, + "loss": 0.1759, + "num_tokens": 27705591.0, + "reward": 2.9454853534698486, + "reward_std": 1.09701406955719, + "rewards/reward_fn/mean": 2.9454853534698486, + "rewards/reward_fn/std": 1.0970139503479004, + "step": 599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1161.0, + "completions/max_terminated_length": 1161.0, + "completions/mean_length": 572.25, + "completions/mean_terminated_length": 572.25, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.06364697146494112, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.01797162415459752, + "learning_rate": 7.7604e-06, + "loss": 0.0005, + "num_tokens": 27773599.0, + "reward": 2.691895008087158, + "reward_std": 0.3327391445636749, + "rewards/reward_fn/mean": 2.691895008087158, + "rewards/reward_fn/std": 0.33273911476135254, + "step": 600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1965.0, + "completions/max_terminated_length": 1965.0, + "completions/mean_length": 387.9375, + "completions/mean_terminated_length": 387.9375, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.06375304975071602, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.01835179328918457, + "learning_rate": 7.76e-06, + "loss": -0.1032, + "num_tokens": 27830749.0, + "reward": 2.8080360889434814, + "reward_std": 0.028673529624938965, + "rewards/reward_fn/mean": 2.8080360889434814, + "rewards/reward_fn/std": 0.02867353893816471, + "step": 601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1102.0, + "completions/max_terminated_length": 1102.0, + "completions/mean_length": 308.03125, + "completions/mean_terminated_length": 308.03125, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.06385912803649094, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07177734375, + "kl": 0.020648242440074682, + "learning_rate": 7.7596e-06, + "loss": 0.0008, + "num_tokens": 27880414.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 917.0, + "completions/mean_length": 640.15625, + "completions/mean_terminated_length": 546.300048828125, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.06396520632226584, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.023265723371878266, + "learning_rate": 7.7592e-06, + "loss": 0.0666, + "num_tokens": 27935683.0, + "reward": 2.6192939281463623, + "reward_std": 0.8203719258308411, + "rewards/reward_fn/mean": 2.6192939281463623, + "rewards/reward_fn/std": 0.8203719258308411, + "step": 603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 597.0, + "completions/max_terminated_length": 597.0, + "completions/mean_length": 339.875, + "completions/mean_terminated_length": 339.875, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.06407128460804073, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.027694092597812414, + "learning_rate": 7.7588e-06, + "loss": -0.0216, + "num_tokens": 27960351.0, + "reward": 3.7123684883117676, + "reward_std": 0.5121208429336548, + "rewards/reward_fn/mean": 3.7123684883117676, + "rewards/reward_fn/std": 0.5121208429336548, + "step": 604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 803.0, + "completions/max_terminated_length": 803.0, + "completions/mean_length": 222.34375, + "completions/mean_terminated_length": 222.34375, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.06417736289381563, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.28125, + "kl": 0.02771115326322615, + "learning_rate": 7.7584e-06, + "loss": -0.0306, + "num_tokens": 28002954.0, + "reward": 3.9631314277648926, + "reward_std": 0.2085607498884201, + "rewards/reward_fn/mean": 3.9631314277648926, + "rewards/reward_fn/std": 0.2085607349872589, + "step": 605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1230.0, + "completions/max_terminated_length": 1230.0, + "completions/mean_length": 297.0, + "completions/mean_terminated_length": 297.0, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.06428344117959053, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.02708641323260963, + "learning_rate": 7.758e-06, + "loss": 0.0956, + "num_tokens": 28045898.0, + "reward": 2.7624735832214355, + "reward_std": 0.34915468096733093, + "rewards/reward_fn/mean": 2.7624735832214355, + "rewards/reward_fn/std": 0.34915465116500854, + "step": 606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 966.0, + "completions/max_terminated_length": 966.0, + "completions/mean_length": 344.78125, + "completions/mean_terminated_length": 344.78125, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.06438951946536543, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.01575366989709437, + "learning_rate": 7.7576e-06, + "loss": -0.054, + "num_tokens": 28096035.0, + "reward": 3.480724811553955, + "reward_std": 0.637944221496582, + "rewards/reward_fn/mean": 3.480724811553955, + "rewards/reward_fn/std": 0.6379441618919373, + "step": 607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1160.0, + "completions/max_terminated_length": 1160.0, + "completions/mean_length": 303.4375, + "completions/mean_terminated_length": 303.4375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.06449559775114035, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.019724218058399856, + "learning_rate": 7.7572e-06, + "loss": 0.1051, + "num_tokens": 28140113.0, + "reward": 3.8815999031066895, + "reward_std": 0.32930222153663635, + "rewards/reward_fn/mean": 3.8815999031066895, + "rewards/reward_fn/std": 0.32930222153663635, + "step": 608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1501.0, + "completions/max_terminated_length": 1501.0, + "completions/mean_length": 279.59375, + "completions/mean_terminated_length": 279.59375, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.06460167603691525, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.023657136596739292, + "learning_rate": 7.7568e-06, + "loss": 0.0655, + "num_tokens": 28179076.0, + "reward": 3.6588215827941895, + "reward_std": 0.5550169944763184, + "rewards/reward_fn/mean": 3.6588215827941895, + "rewards/reward_fn/std": 0.5550169944763184, + "step": 609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1025.0, + "completions/max_terminated_length": 1025.0, + "completions/mean_length": 315.9375, + "completions/mean_terminated_length": 315.9375, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.06470775432269014, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.020349433412775397, + "learning_rate": 7.756399999999999e-06, + "loss": 0.0056, + "num_tokens": 28236546.0, + "reward": 1.650451898574829, + "reward_std": 0.03633672744035721, + "rewards/reward_fn/mean": 1.650451898574829, + "rewards/reward_fn/std": 0.0363367535173893, + "step": 610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.0, + "completions/max_terminated_length": 707.0, + "completions/mean_length": 203.28125, + "completions/mean_terminated_length": 203.28125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.06481383260846504, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08447265625, + "kl": 0.01983096171170473, + "learning_rate": 7.756e-06, + "loss": 0.0008, + "num_tokens": 28277451.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1197.0, + "completions/max_terminated_length": 1197.0, + "completions/mean_length": 290.1875, + "completions/mean_terminated_length": 290.1875, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.06491991089423994, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.023188997758552432, + "learning_rate": 7.7556e-06, + "loss": -0.0779, + "num_tokens": 28319761.0, + "reward": 3.2103209495544434, + "reward_std": 0.23490554094314575, + "rewards/reward_fn/mean": 3.2103209495544434, + "rewards/reward_fn/std": 0.23490552604198456, + "step": 612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1524.0, + "completions/max_terminated_length": 1524.0, + "completions/mean_length": 269.1875, + "completions/mean_terminated_length": 269.1875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.06502598918001486, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8046875, + "kl": 0.023811078863218427, + "learning_rate": 7.7552e-06, + "loss": -0.0893, + "num_tokens": 28367543.0, + "reward": 3.9686954021453857, + "reward_std": 0.17708587646484375, + "rewards/reward_fn/mean": 3.9686954021453857, + "rewards/reward_fn/std": 0.17708587646484375, + "step": 613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 220.09375, + "completions/mean_terminated_length": 220.09375, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.06513206746578976, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.024617396760731936, + "learning_rate": 7.7548e-06, + "loss": 0.0609, + "num_tokens": 28420698.0, + "reward": 3.9674062728881836, + "reward_std": 0.1843782216310501, + "rewards/reward_fn/mean": 3.9674062728881836, + "rewards/reward_fn/std": 0.1843782216310501, + "step": 614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1238.0, + "completions/max_terminated_length": 1238.0, + "completions/mean_length": 224.09375, + "completions/mean_terminated_length": 224.09375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.06523814575156466, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09326171875, + "kl": 0.0253062816336751, + "learning_rate": 7.7544e-06, + "loss": 0.001, + "num_tokens": 28457085.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 217.34375, + "completions/mean_terminated_length": 217.34375, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.06534422403733955, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.02408299339003861, + "learning_rate": 7.753999999999999e-06, + "loss": 0.0, + "num_tokens": 28502504.0, + "reward": 3.666853666305542, + "reward_std": 0.7510157823562622, + "rewards/reward_fn/mean": 3.666853666305542, + "rewards/reward_fn/std": 0.7510157823562622, + "step": 616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 788.0, + "completions/max_terminated_length": 788.0, + "completions/mean_length": 194.34375, + "completions/mean_terminated_length": 194.34375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.06545030232311445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10498046875, + "kl": 0.01842890668194741, + "learning_rate": 7.7536e-06, + "loss": 0.0007, + "num_tokens": 28560915.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1946.0, + "completions/mean_length": 1078.65625, + "completions/mean_terminated_length": 978.3793334960938, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.06555638060888937, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0, + "kl": 0.013798431027680635, + "learning_rate": 7.753199999999999e-06, + "loss": 0.1245, + "num_tokens": 28631080.0, + "reward": 2.1499953269958496, + "reward_std": 0.8221304416656494, + "rewards/reward_fn/mean": 2.1499953269958496, + "rewards/reward_fn/std": 0.8221304416656494, + "step": 618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 991.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 279.21875, + "completions/mean_terminated_length": 279.21875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.06566245889466427, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.78125, + "kl": 0.03788356087170541, + "learning_rate": 7.7528e-06, + "loss": 0.1088, + "num_tokens": 28659407.0, + "reward": 2.988480567932129, + "reward_std": 0.23437552154064178, + "rewards/reward_fn/mean": 2.988480567932129, + "rewards/reward_fn/std": 0.23437556624412537, + "step": 619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 940.0, + "completions/max_terminated_length": 940.0, + "completions/mean_length": 268.71875, + "completions/mean_terminated_length": 268.71875, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.06576853718043917, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.022709642769768834, + "learning_rate": 7.752399999999999e-06, + "loss": 0.1208, + "num_tokens": 28697734.0, + "reward": 3.431100606918335, + "reward_std": 0.6189658045768738, + "rewards/reward_fn/mean": 3.431100606918335, + "rewards/reward_fn/std": 0.6189658641815186, + "step": 620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1274.0, + "completions/max_terminated_length": 1274.0, + "completions/mean_length": 283.0625, + "completions/mean_terminated_length": 283.0625, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.06587461546621406, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.02149501978419721, + "learning_rate": 7.752e-06, + "loss": 0.0237, + "num_tokens": 28752904.0, + "reward": 1.8805391788482666, + "reward_std": 0.6935924291610718, + "rewards/reward_fn/mean": 1.8805391788482666, + "rewards/reward_fn/std": 0.6935924291610718, + "step": 621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 765.0, + "completions/max_terminated_length": 765.0, + "completions/mean_length": 383.125, + "completions/mean_terminated_length": 383.125, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.06598069375198896, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.01584722870029509, + "learning_rate": 7.751599999999999e-06, + "loss": -0.006, + "num_tokens": 28805388.0, + "reward": 2.8024349212646484, + "reward_std": 0.36229246854782104, + "rewards/reward_fn/mean": 2.8024349212646484, + "rewards/reward_fn/std": 0.36229249835014343, + "step": 622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1102.0, + "completions/max_terminated_length": 1102.0, + "completions/mean_length": 410.21875, + "completions/mean_terminated_length": 410.21875, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.06608677203776386, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046875, + "kl": 0.020291926339268684, + "learning_rate": 7.7512e-06, + "loss": -0.0774, + "num_tokens": 28855475.0, + "reward": 3.961885690689087, + "reward_std": 0.2156069576740265, + "rewards/reward_fn/mean": 3.961885690689087, + "rewards/reward_fn/std": 0.2156069576740265, + "step": 623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 915.0, + "completions/max_terminated_length": 915.0, + "completions/mean_length": 223.03125, + "completions/mean_terminated_length": 223.03125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.06619285032353878, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "kl": 0.024670890532433987, + "learning_rate": 7.7508e-06, + "loss": 0.1838, + "num_tokens": 28909012.0, + "reward": 3.6656932830810547, + "reward_std": 0.5034979581832886, + "rewards/reward_fn/mean": 3.6656932830810547, + "rewards/reward_fn/std": 0.5034979581832886, + "step": 624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1172.0, + "completions/max_terminated_length": 1172.0, + "completions/mean_length": 375.15625, + "completions/mean_terminated_length": 375.15625, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.06629892860931368, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.02088888338766992, + "learning_rate": 7.7504e-06, + "loss": -0.0578, + "num_tokens": 28951257.0, + "reward": 2.5856075286865234, + "reward_std": 0.31954845786094666, + "rewards/reward_fn/mean": 2.5856075286865234, + "rewards/reward_fn/std": 0.31954848766326904, + "step": 625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 265.90625, + "completions/mean_terminated_length": 265.90625, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.06640500689508858, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.01428930729161948, + "learning_rate": 7.75e-06, + "loss": 0.0585, + "num_tokens": 28981846.0, + "reward": 3.45589542388916, + "reward_std": 0.6938397288322449, + "rewards/reward_fn/mean": 3.45589542388916, + "rewards/reward_fn/std": 0.6938397288322449, + "step": 626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 955.0, + "completions/max_terminated_length": 955.0, + "completions/mean_length": 257.3125, + "completions/mean_terminated_length": 257.3125, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.06651108518086347, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06201171875, + "kl": 0.016050009289756417, + "learning_rate": 7.7496e-06, + "loss": 0.0006, + "num_tokens": 29035904.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 115.21875, + "completions/mean_terminated_length": 115.21875, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.06661716346663837, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.75, + "kl": 0.02869252348318696, + "learning_rate": 7.7492e-06, + "loss": -0.0297, + "num_tokens": 29077415.0, + "reward": 3.737992763519287, + "reward_std": 0.5032368302345276, + "rewards/reward_fn/mean": 3.737992763519287, + "rewards/reward_fn/std": 0.5032367706298828, + "step": 628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 636.0, + "completions/max_terminated_length": 636.0, + "completions/mean_length": 266.6875, + "completions/mean_terminated_length": 266.6875, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.06672324175241329, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.01821572007611394, + "learning_rate": 7.7488e-06, + "loss": 0.0338, + "num_tokens": 29125021.0, + "reward": 2.8474972248077393, + "reward_std": 0.2903161644935608, + "rewards/reward_fn/mean": 2.8474972248077393, + "rewards/reward_fn/std": 0.2903161942958832, + "step": 629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1196.0, + "completions/max_terminated_length": 1196.0, + "completions/mean_length": 201.375, + "completions/mean_terminated_length": 201.375, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.06682932003818819, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07861328125, + "kl": 0.02066903980448842, + "learning_rate": 7.7484e-06, + "loss": 0.0008, + "num_tokens": 29171081.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 157.15625, + "completions/mean_terminated_length": 157.15625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.06693539832396309, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0986328125, + "kl": 0.018408002564683557, + "learning_rate": 7.748e-06, + "loss": 0.0007, + "num_tokens": 29208398.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1817.0, + "completions/max_terminated_length": 1817.0, + "completions/mean_length": 382.125, + "completions/mean_terminated_length": 382.125, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.06704147660973798, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.016226678737439215, + "learning_rate": 7.7476e-06, + "loss": 0.1208, + "num_tokens": 29263634.0, + "reward": 2.820263385772705, + "reward_std": 0.028630422428250313, + "rewards/reward_fn/mean": 2.820263385772705, + "rewards/reward_fn/std": 0.02863038145005703, + "step": 632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1256.0, + "completions/max_terminated_length": 1256.0, + "completions/mean_length": 324.71875, + "completions/mean_terminated_length": 324.71875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.06714755489551288, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.02659795875661075, + "learning_rate": 7.7472e-06, + "loss": -0.0069, + "num_tokens": 29305705.0, + "reward": 3.5446953773498535, + "reward_std": 0.7270760536193848, + "rewards/reward_fn/mean": 3.5446953773498535, + "rewards/reward_fn/std": 0.7270760536193848, + "step": 633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 107.5625, + "completions/mean_terminated_length": 107.5625, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.06725363318128778, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.234375, + "kl": 0.026664254954084754, + "learning_rate": 7.7468e-06, + "loss": 0.0563, + "num_tokens": 29356571.0, + "reward": 2.822035551071167, + "reward_std": 0.03083919733762741, + "rewards/reward_fn/mean": 2.822035551071167, + "rewards/reward_fn/std": 0.030839232727885246, + "step": 634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 171.3125, + "completions/mean_terminated_length": 171.3125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.0673597114670627, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0927734375, + "kl": 0.02076897514052689, + "learning_rate": 7.7464e-06, + "loss": 0.0008, + "num_tokens": 29417093.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1839.0, + "completions/max_terminated_length": 1839.0, + "completions/mean_length": 315.9375, + "completions/mean_terminated_length": 315.9375, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.0674657897528376, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "kl": 0.020625132601708174, + "learning_rate": 7.746e-06, + "loss": -0.0231, + "num_tokens": 29469347.0, + "reward": 3.4893736839294434, + "reward_std": 0.8306846618652344, + "rewards/reward_fn/mean": 3.4893736839294434, + "rewards/reward_fn/std": 0.8306846618652344, + "step": 636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1967.0, + "completions/max_terminated_length": 1967.0, + "completions/mean_length": 430.0625, + "completions/mean_terminated_length": 430.0625, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.0675718680386125, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.02193266712129116, + "learning_rate": 7.7456e-06, + "loss": 0.199, + "num_tokens": 29521989.0, + "reward": 3.3931772708892822, + "reward_std": 0.9524803757667542, + "rewards/reward_fn/mean": 3.3931772708892822, + "rewards/reward_fn/std": 0.9524803757667542, + "step": 637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.0, + "completions/max_terminated_length": 637.0, + "completions/mean_length": 218.59375, + "completions/mean_terminated_length": 218.59375, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.0676779463243874, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.01927727018482983, + "learning_rate": 7.7452e-06, + "loss": 0.0266, + "num_tokens": 29557464.0, + "reward": 3.8538365364074707, + "reward_std": 0.3451085090637207, + "rewards/reward_fn/mean": 3.8538365364074707, + "rewards/reward_fn/std": 0.3451085090637207, + "step": 638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1125.0, + "completions/max_terminated_length": 1125.0, + "completions/mean_length": 226.625, + "completions/mean_terminated_length": 226.625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.0677840246101623, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09912109375, + "kl": 0.023490537889301777, + "learning_rate": 7.744799999999999e-06, + "loss": 0.0009, + "num_tokens": 29601612.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 169.15625, + "completions/mean_terminated_length": 169.15625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.06789010289593721, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1044921875, + "kl": 0.022257780889049172, + "learning_rate": 7.7444e-06, + "loss": 0.0009, + "num_tokens": 29650641.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 749.0, + "completions/max_terminated_length": 749.0, + "completions/mean_length": 170.46875, + "completions/mean_terminated_length": 170.46875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.0679961811817121, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.01819098659325391, + "learning_rate": 7.743999999999999e-06, + "loss": 0.0007, + "num_tokens": 29711392.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1487.0, + "completions/max_terminated_length": 1487.0, + "completions/mean_length": 571.28125, + "completions/mean_terminated_length": 571.28125, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "epoch": 0.068102259467487, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.014475560979917645, + "learning_rate": 7.7436e-06, + "loss": 0.0193, + "num_tokens": 29767849.0, + "reward": 2.6820812225341797, + "reward_std": 0.33843210339546204, + "rewards/reward_fn/mean": 2.6820812225341797, + "rewards/reward_fn/std": 0.3384321331977844, + "step": 642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1318.0, + "completions/max_terminated_length": 1318.0, + "completions/mean_length": 672.3125, + "completions/mean_terminated_length": 672.3125, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.0682083377532619, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.017669666092842817, + "learning_rate": 7.743199999999999e-06, + "loss": -0.0573, + "num_tokens": 29826579.0, + "reward": 2.787215232849121, + "reward_std": 0.32044708728790283, + "rewards/reward_fn/mean": 2.787215232849121, + "rewards/reward_fn/std": 0.3204471170902252, + "step": 643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 122.84375, + "completions/mean_terminated_length": 122.84375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.0683144160390368, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1015625, + "kl": 0.019987852778285742, + "learning_rate": 7.7428e-06, + "loss": 0.0008, + "num_tokens": 29863278.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.0, + "completions/max_terminated_length": 679.0, + "completions/mean_length": 273.875, + "completions/mean_terminated_length": 273.875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.06842049432481172, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.026723440503701568, + "learning_rate": 7.742399999999999e-06, + "loss": -0.045, + "num_tokens": 29911210.0, + "reward": 3.9678614139556885, + "reward_std": 0.18180328607559204, + "rewards/reward_fn/mean": 3.9678614139556885, + "rewards/reward_fn/std": 0.18180328607559204, + "step": 645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 875.0, + "completions/max_terminated_length": 875.0, + "completions/mean_length": 346.90625, + "completions/mean_terminated_length": 346.90625, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.06852657261058662, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.018849076703190804, + "learning_rate": 7.742e-06, + "loss": -0.017, + "num_tokens": 29950791.0, + "reward": 3.6496143341064453, + "reward_std": 0.6005396842956543, + "rewards/reward_fn/mean": 3.6496143341064453, + "rewards/reward_fn/std": 0.6005396842956543, + "step": 646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 776.0, + "completions/max_terminated_length": 776.0, + "completions/mean_length": 181.40625, + "completions/mean_terminated_length": 181.40625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.06863265089636152, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09912109375, + "kl": 0.01960382249671966, + "learning_rate": 7.741599999999999e-06, + "loss": 0.0008, + "num_tokens": 29989428.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1647.0, + "completions/max_terminated_length": 1647.0, + "completions/mean_length": 381.21875, + "completions/mean_terminated_length": 381.21875, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.06873872918213642, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.02327621285803616, + "learning_rate": 7.7412e-06, + "loss": 0.1426, + "num_tokens": 30037851.0, + "reward": 3.4980902671813965, + "reward_std": 0.7049679756164551, + "rewards/reward_fn/mean": 3.4980902671813965, + "rewards/reward_fn/std": 0.7049679160118103, + "step": 648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 682.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 299.90625, + "completions/mean_terminated_length": 299.90625, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.06884480746791131, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.018094201455824077, + "learning_rate": 7.7408e-06, + "loss": -0.0152, + "num_tokens": 30095672.0, + "reward": 3.6386489868164062, + "reward_std": 0.8532451391220093, + "rewards/reward_fn/mean": 3.6386489868164062, + "rewards/reward_fn/std": 0.8532451391220093, + "step": 649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.0, + "completions/max_terminated_length": 549.0, + "completions/mean_length": 153.125, + "completions/mean_terminated_length": 153.125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.06895088575368621, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11962890625, + "kl": 0.023316435981541872, + "learning_rate": 7.7404e-06, + "loss": 0.0009, + "num_tokens": 30131132.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.0, + "completions/max_terminated_length": 545.0, + "completions/mean_length": 145.53125, + "completions/mean_terminated_length": 145.53125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.06905696403946113, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.03125, + "kl": 0.026125010568648577, + "learning_rate": 7.74e-06, + "loss": -0.0946, + "num_tokens": 30154061.0, + "reward": 3.0254387855529785, + "reward_std": 0.07992041856050491, + "rewards/reward_fn/mean": 3.0254387855529785, + "rewards/reward_fn/std": 0.07992040365934372, + "step": 651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1189.0, + "completions/max_terminated_length": 1189.0, + "completions/mean_length": 458.0625, + "completions/mean_terminated_length": 458.0625, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.06916304232523603, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.01891616778448224, + "learning_rate": 7.7396e-06, + "loss": 0.0773, + "num_tokens": 30224079.0, + "reward": 2.673313617706299, + "reward_std": 0.30429506301879883, + "rewards/reward_fn/mean": 2.673313617706299, + "rewards/reward_fn/std": 0.30429503321647644, + "step": 652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1321.0, + "completions/max_terminated_length": 1321.0, + "completions/mean_length": 351.46875, + "completions/mean_terminated_length": 351.46875, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.06926912061101093, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.023535766871646047, + "learning_rate": 7.7392e-06, + "loss": -0.027, + "num_tokens": 30265534.0, + "reward": 3.135115623474121, + "reward_std": 0.47822919487953186, + "rewards/reward_fn/mean": 3.135115623474121, + "rewards/reward_fn/std": 0.47822922468185425, + "step": 653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1862.0, + "completions/mean_length": 461.40625, + "completions/mean_terminated_length": 410.2257995605469, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.06937519889678583, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.018667538883164525, + "learning_rate": 7.7388e-06, + "loss": 0.1329, + "num_tokens": 30315243.0, + "reward": 3.3491392135620117, + "reward_std": 1.0377370119094849, + "rewards/reward_fn/mean": 3.3491392135620117, + "rewards/reward_fn/std": 1.0377370119094849, + "step": 654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1697.0, + "completions/max_terminated_length": 1697.0, + "completions/mean_length": 685.59375, + "completions/mean_terminated_length": 685.59375, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.06948127718256072, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.01727759197819978, + "learning_rate": 7.7384e-06, + "loss": 0.0192, + "num_tokens": 30373502.0, + "reward": 3.753196954727173, + "reward_std": 0.5557073950767517, + "rewards/reward_fn/mean": 3.753196954727173, + "rewards/reward_fn/std": 0.5557073950767517, + "step": 655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.0, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 181.21875, + "completions/mean_terminated_length": 181.21875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.06958735546833564, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.018722419743426144, + "learning_rate": 7.738e-06, + "loss": 0.1536, + "num_tokens": 30412837.0, + "reward": 2.847813606262207, + "reward_std": 0.032345082610845566, + "rewards/reward_fn/mean": 2.847813606262207, + "rewards/reward_fn/std": 0.03234507888555527, + "step": 656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1239.0, + "completions/max_terminated_length": 1239.0, + "completions/mean_length": 343.125, + "completions/mean_terminated_length": 343.125, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.06969343375411054, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.03125, + "kl": 0.01867408840917051, + "learning_rate": 7.737599999999999e-06, + "loss": 0.0086, + "num_tokens": 30456777.0, + "reward": 2.8930726051330566, + "reward_std": 0.20554324984550476, + "rewards/reward_fn/mean": 2.8930726051330566, + "rewards/reward_fn/std": 0.20554324984550476, + "step": 657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1573.0, + "completions/max_terminated_length": 1573.0, + "completions/mean_length": 438.90625, + "completions/mean_terminated_length": 438.90625, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.06979951203988544, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.013406037352979183, + "learning_rate": 7.7372e-06, + "loss": 0.0336, + "num_tokens": 30491014.0, + "reward": 2.6613709926605225, + "reward_std": 0.055673278868198395, + "rewards/reward_fn/mean": 2.6613709926605225, + "rewards/reward_fn/std": 0.055673304945230484, + "step": 658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1197.0, + "completions/max_terminated_length": 1197.0, + "completions/mean_length": 352.9375, + "completions/mean_terminated_length": 352.9375, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.06990559032566034, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.02186316321603954, + "learning_rate": 7.736799999999998e-06, + "loss": 0.0747, + "num_tokens": 30519428.0, + "reward": 3.3577191829681396, + "reward_std": 0.6026961803436279, + "rewards/reward_fn/mean": 3.3577191829681396, + "rewards/reward_fn/std": 0.6026961803436279, + "step": 659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1346.0, + "completions/max_terminated_length": 1346.0, + "completions/mean_length": 477.875, + "completions/mean_terminated_length": 477.875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.07001166861143523, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.953125, + "kl": 0.018300026771612465, + "learning_rate": 7.7364e-06, + "loss": -0.0899, + "num_tokens": 30571616.0, + "reward": 3.9199166297912598, + "reward_std": 0.3151904344558716, + "rewards/reward_fn/mean": 3.9199166297912598, + "rewards/reward_fn/std": 0.3151904046535492, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1196.0, + "completions/max_terminated_length": 1196.0, + "completions/mean_length": 323.34375, + "completions/mean_terminated_length": 323.34375, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.07011774689721013, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.022360553964972496, + "learning_rate": 7.736e-06, + "loss": -0.0118, + "num_tokens": 30612235.0, + "reward": 2.710085391998291, + "reward_std": 0.1893293261528015, + "rewards/reward_fn/mean": 2.710085391998291, + "rewards/reward_fn/std": 0.1893293410539627, + "step": 661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 816.0, + "completions/max_terminated_length": 816.0, + "completions/mean_length": 146.6875, + "completions/mean_terminated_length": 146.6875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.07022382518298505, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10009765625, + "kl": 0.021579281659796834, + "learning_rate": 7.7356e-06, + "loss": 0.0009, + "num_tokens": 30645281.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 137.09375, + "completions/mean_terminated_length": 137.09375, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.07032990346875995, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.171875, + "kl": 0.02280471404083073, + "learning_rate": 7.7352e-06, + "loss": 0.0127, + "num_tokens": 30693796.0, + "reward": 3.1744942665100098, + "reward_std": 0.06619588285684586, + "rewards/reward_fn/mean": 3.1744942665100098, + "rewards/reward_fn/std": 0.06619583815336227, + "step": 663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1954.0, + "completions/max_terminated_length": 1954.0, + "completions/mean_length": 472.78125, + "completions/mean_terminated_length": 472.78125, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.07043598175453485, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.019758876063860953, + "learning_rate": 7.7348e-06, + "loss": 0.1493, + "num_tokens": 30727741.0, + "reward": 3.272916316986084, + "reward_std": 0.8355657458305359, + "rewards/reward_fn/mean": 3.272916316986084, + "rewards/reward_fn/std": 0.8355657458305359, + "step": 664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.0, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 184.9375, + "completions/mean_terminated_length": 184.9375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.07054206004030975, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.099609375, + "kl": 0.020734936697408557, + "learning_rate": 7.7344e-06, + "loss": 0.0008, + "num_tokens": 30782395.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 227.5625, + "completions/mean_terminated_length": 227.5625, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.07064813832608464, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1640625, + "kl": 0.029407049994915724, + "learning_rate": 7.733999999999999e-06, + "loss": 0.0012, + "num_tokens": 30830157.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1116.0, + "completions/max_terminated_length": 1116.0, + "completions/mean_length": 342.46875, + "completions/mean_terminated_length": 342.46875, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.07075421661185956, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.023776059737429023, + "learning_rate": 7.7336e-06, + "loss": 0.1323, + "num_tokens": 30857724.0, + "reward": 3.5412867069244385, + "reward_std": 0.5026692152023315, + "rewards/reward_fn/mean": 3.5412867069244385, + "rewards/reward_fn/std": 0.5026691555976868, + "step": 667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1322.0, + "completions/max_terminated_length": 1322.0, + "completions/mean_length": 213.90625, + "completions/mean_terminated_length": 213.90625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.07086029489763446, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.026352980406954885, + "learning_rate": 7.733199999999999e-06, + "loss": -0.1159, + "num_tokens": 30882425.0, + "reward": 3.824970245361328, + "reward_std": 0.41358694434165955, + "rewards/reward_fn/mean": 3.824970245361328, + "rewards/reward_fn/std": 0.41358694434165955, + "step": 668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1241.0, + "completions/max_terminated_length": 1241.0, + "completions/mean_length": 550.3125, + "completions/mean_terminated_length": 550.3125, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.07096637318340936, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.83203125, + "kl": 0.017197473789565265, + "learning_rate": 7.7328e-06, + "loss": -0.086, + "num_tokens": 30943363.0, + "reward": 3.852973699569702, + "reward_std": 0.4963712990283966, + "rewards/reward_fn/mean": 3.852973699569702, + "rewards/reward_fn/std": 0.4963712692260742, + "step": 669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 694.0, + "completions/max_terminated_length": 694.0, + "completions/mean_length": 223.71875, + "completions/mean_terminated_length": 223.71875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.07107245146918426, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.022278335178270936, + "learning_rate": 7.732399999999999e-06, + "loss": 0.0117, + "num_tokens": 30996954.0, + "reward": 3.4236133098602295, + "reward_std": 0.5188043713569641, + "rewards/reward_fn/mean": 3.4236133098602295, + "rewards/reward_fn/std": 0.5188043713569641, + "step": 670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1981.0, + "completions/mean_length": 473.53125, + "completions/mean_terminated_length": 422.7419128417969, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.07117852975495916, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.027860619127750397, + "learning_rate": 7.732e-06, + "loss": 0.4003, + "num_tokens": 31042411.0, + "reward": 2.8622326850891113, + "reward_std": 0.21642757952213287, + "rewards/reward_fn/mean": 2.8622326850891113, + "rewards/reward_fn/std": 0.21642759442329407, + "step": 671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1963.0, + "completions/mean_length": 1063.59375, + "completions/mean_terminated_length": 961.7586059570312, + "completions/min_length": 436.0, + "completions/min_terminated_length": 436.0, + "epoch": 0.07128460804073407, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8828125, + "kl": 0.015642878832295537, + "learning_rate": 7.7316e-06, + "loss": 0.1365, + "num_tokens": 31111838.0, + "reward": 2.18656063079834, + "reward_std": 0.8373485207557678, + "rewards/reward_fn/mean": 2.18656063079834, + "rewards/reward_fn/std": 0.8373485207557678, + "step": 672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 854.0, + "completions/max_terminated_length": 854.0, + "completions/mean_length": 370.34375, + "completions/mean_terminated_length": 370.34375, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.07139068632650897, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.017991895554587245, + "learning_rate": 7.7312e-06, + "loss": 0.0152, + "num_tokens": 31155849.0, + "reward": 3.9679219722747803, + "reward_std": 0.18146038055419922, + "rewards/reward_fn/mean": 3.9679219722747803, + "rewards/reward_fn/std": 0.18146035075187683, + "step": 673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 550.0, + "completions/max_terminated_length": 550.0, + "completions/mean_length": 234.96875, + "completions/mean_terminated_length": 234.96875, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.07149676461228387, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.02120826207101345, + "learning_rate": 7.7308e-06, + "loss": 0.0147, + "num_tokens": 31203592.0, + "reward": 3.9284887313842773, + "reward_std": 0.4045286774635315, + "rewards/reward_fn/mean": 3.9284887313842773, + "rewards/reward_fn/std": 0.4045286476612091, + "step": 674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1212.0, + "completions/max_terminated_length": 1212.0, + "completions/mean_length": 323.625, + "completions/mean_terminated_length": 323.625, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.07160284289805877, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.022802867693826556, + "learning_rate": 7.7304e-06, + "loss": -0.0868, + "num_tokens": 31253724.0, + "reward": 3.25797963142395, + "reward_std": 0.7901930809020996, + "rewards/reward_fn/mean": 3.25797963142395, + "rewards/reward_fn/std": 0.7901931405067444, + "step": 675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1011.0, + "completions/max_terminated_length": 1011.0, + "completions/mean_length": 302.8125, + "completions/mean_terminated_length": 302.8125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.07170892118383367, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.014244599267840385, + "learning_rate": 7.73e-06, + "loss": -0.0201, + "num_tokens": 31296598.0, + "reward": 2.8737645149230957, + "reward_std": 0.03616030886769295, + "rewards/reward_fn/mean": 2.8737645149230957, + "rewards/reward_fn/std": 0.03616032004356384, + "step": 676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1115.0, + "completions/mean_length": 601.28125, + "completions/mean_terminated_length": 554.6129150390625, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.07181499946960856, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.01887105731293559, + "learning_rate": 7.7296e-06, + "loss": 0.1873, + "num_tokens": 31352735.0, + "reward": 2.5296192169189453, + "reward_std": 0.8162726759910583, + "rewards/reward_fn/mean": 2.5296192169189453, + "rewards/reward_fn/std": 0.8162726163864136, + "step": 677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 971.0, + "completions/max_terminated_length": 971.0, + "completions/mean_length": 349.75, + "completions/mean_terminated_length": 349.75, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.07192107775538348, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.024308583000674844, + "learning_rate": 7.729199999999999e-06, + "loss": 0.0147, + "num_tokens": 31393367.0, + "reward": 3.064675807952881, + "reward_std": 1.1492177248001099, + "rewards/reward_fn/mean": 3.064675807952881, + "rewards/reward_fn/std": 1.1492178440093994, + "step": 678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 190.21875, + "completions/mean_terminated_length": 190.21875, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.07202715604115838, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.02310982788912952, + "learning_rate": 7.7288e-06, + "loss": -0.0445, + "num_tokens": 31450398.0, + "reward": 3.965233325958252, + "reward_std": 0.1966707557439804, + "rewards/reward_fn/mean": 3.965233325958252, + "rewards/reward_fn/std": 0.1966707557439804, + "step": 679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 923.0, + "completions/max_terminated_length": 923.0, + "completions/mean_length": 331.125, + "completions/mean_terminated_length": 331.125, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.07213323432693328, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.015066134044900537, + "learning_rate": 7.728399999999999e-06, + "loss": 0.087, + "num_tokens": 31503330.0, + "reward": 3.8951284885406494, + "reward_std": 0.4410572648048401, + "rewards/reward_fn/mean": 3.8951284885406494, + "rewards/reward_fn/std": 0.4410572946071625, + "step": 680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 931.0, + "completions/max_terminated_length": 931.0, + "completions/mean_length": 253.625, + "completions/mean_terminated_length": 253.625, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.07223931261270818, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.022291683591902256, + "learning_rate": 7.728e-06, + "loss": -0.0688, + "num_tokens": 31565494.0, + "reward": 3.6002941131591797, + "reward_std": 0.5299660563468933, + "rewards/reward_fn/mean": 3.6002941131591797, + "rewards/reward_fn/std": 0.5299659967422485, + "step": 681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.0, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 215.59375, + "completions/mean_terminated_length": 215.59375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.07234539089848308, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.140625, + "kl": 0.02722454722970724, + "learning_rate": 7.727599999999999e-06, + "loss": 0.0011, + "num_tokens": 31621225.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 654.0, + "completions/max_terminated_length": 654.0, + "completions/mean_length": 236.9375, + "completions/mean_terminated_length": 236.9375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.07245146918425799, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.0214088864158839, + "learning_rate": 7.7272e-06, + "loss": 0.0853, + "num_tokens": 31663399.0, + "reward": 3.0963635444641113, + "reward_std": 0.06334761530160904, + "rewards/reward_fn/mean": 3.0963635444641113, + "rewards/reward_fn/std": 0.06334759294986725, + "step": 683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1901.0, + "completions/mean_length": 519.125, + "completions/mean_terminated_length": 469.8064270019531, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.07255754747003289, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.023146436316892505, + "learning_rate": 7.7268e-06, + "loss": 0.2416, + "num_tokens": 31720203.0, + "reward": 2.755568742752075, + "reward_std": 0.5547494292259216, + "rewards/reward_fn/mean": 2.755568742752075, + "rewards/reward_fn/std": 0.5547494888305664, + "step": 684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1290.0, + "completions/max_terminated_length": 1290.0, + "completions/mean_length": 560.8125, + "completions/mean_terminated_length": 560.8125, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.07266362575580779, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.022556508192792535, + "learning_rate": 7.7264e-06, + "loss": 0.1027, + "num_tokens": 31787141.0, + "reward": 2.8888206481933594, + "reward_std": 0.36274033784866333, + "rewards/reward_fn/mean": 2.8888206481933594, + "rewards/reward_fn/std": 0.3627403676509857, + "step": 685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1970.0, + "completions/mean_length": 1064.65625, + "completions/mean_terminated_length": 736.875, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.07276970404158269, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.013749472331255674, + "learning_rate": 7.726e-06, + "loss": 0.3949, + "num_tokens": 31860570.0, + "reward": 2.409977436065674, + "reward_std": 1.4189443588256836, + "rewards/reward_fn/mean": 2.409977436065674, + "rewards/reward_fn/std": 1.4189443588256836, + "step": 686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.0, + "completions/max_terminated_length": 690.0, + "completions/mean_length": 227.40625, + "completions/mean_terminated_length": 227.40625, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.07287578232735759, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08056640625, + "kl": 0.021269621676765382, + "learning_rate": 7.7256e-06, + "loss": 0.0009, + "num_tokens": 31896007.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1312.0, + "completions/max_terminated_length": 1312.0, + "completions/mean_length": 328.40625, + "completions/mean_terminated_length": 328.40625, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.07298186061313248, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.03248383407481015, + "learning_rate": 7.7252e-06, + "loss": 0.2071, + "num_tokens": 31945428.0, + "reward": 3.6226577758789062, + "reward_std": 0.5695524215698242, + "rewards/reward_fn/mean": 3.6226577758789062, + "rewards/reward_fn/std": 0.5695523619651794, + "step": 688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1452.0, + "completions/max_terminated_length": 1452.0, + "completions/mean_length": 275.75, + "completions/mean_terminated_length": 275.75, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.0730879388989074, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.024565639439970255, + "learning_rate": 7.7248e-06, + "loss": -0.0209, + "num_tokens": 32000332.0, + "reward": 3.962454319000244, + "reward_std": 0.2123897820711136, + "rewards/reward_fn/mean": 3.962454319000244, + "rewards/reward_fn/std": 0.21238979697227478, + "step": 689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 833.0, + "completions/max_terminated_length": 833.0, + "completions/mean_length": 306.09375, + "completions/mean_terminated_length": 306.09375, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.0731940171846823, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.019663402810692787, + "learning_rate": 7.7244e-06, + "loss": 0.0733, + "num_tokens": 32058191.0, + "reward": 3.1262402534484863, + "reward_std": 0.28920337557792664, + "rewards/reward_fn/mean": 3.1262402534484863, + "rewards/reward_fn/std": 0.289203405380249, + "step": 690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 753.0, + "completions/max_terminated_length": 753.0, + "completions/mean_length": 489.0625, + "completions/mean_terminated_length": 489.0625, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.0733000954704572, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.022890981985256076, + "learning_rate": 7.724e-06, + "loss": -0.002, + "num_tokens": 32106673.0, + "reward": 3.150282144546509, + "reward_std": 0.45853471755981445, + "rewards/reward_fn/mean": 3.150282144546509, + "rewards/reward_fn/std": 0.4585346579551697, + "step": 691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1772.0, + "completions/mean_length": 514.90625, + "completions/mean_terminated_length": 412.70001220703125, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.0734061737562321, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.021816590800881386, + "learning_rate": 7.7236e-06, + "loss": 0.1898, + "num_tokens": 32143982.0, + "reward": 2.987020969390869, + "reward_std": 1.2113451957702637, + "rewards/reward_fn/mean": 2.987020969390869, + "rewards/reward_fn/std": 1.2113451957702637, + "step": 692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1178.0, + "completions/mean_length": 559.5, + "completions/mean_terminated_length": 460.2666931152344, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.073512252042007, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.02110014483332634, + "learning_rate": 7.7232e-06, + "loss": 0.2636, + "num_tokens": 32183230.0, + "reward": 2.5472910404205322, + "reward_std": 0.872420072555542, + "rewards/reward_fn/mean": 2.5472910404205322, + "rewards/reward_fn/std": 0.872420072555542, + "step": 693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1967.0, + "completions/max_terminated_length": 1967.0, + "completions/mean_length": 708.1875, + "completions/mean_terminated_length": 708.1875, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "epoch": 0.07361833032778191, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.015690243802964687, + "learning_rate": 7.7228e-06, + "loss": 0.1217, + "num_tokens": 32219076.0, + "reward": 2.446472644805908, + "reward_std": 0.4943988025188446, + "rewards/reward_fn/mean": 2.446472644805908, + "rewards/reward_fn/std": 0.4943988025188446, + "step": 694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1022.0, + "completions/max_terminated_length": 1022.0, + "completions/mean_length": 326.84375, + "completions/mean_terminated_length": 326.84375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.07372440861355681, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.02663695439696312, + "learning_rate": 7.7224e-06, + "loss": 0.0019, + "num_tokens": 32266015.0, + "reward": 3.3688626289367676, + "reward_std": 0.5659106373786926, + "rewards/reward_fn/mean": 3.3688626289367676, + "rewards/reward_fn/std": 0.5659106373786926, + "step": 695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 169.5625, + "completions/mean_terminated_length": 169.5625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.07383048689933171, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10791015625, + "kl": 0.025914974743500352, + "learning_rate": 7.722e-06, + "loss": 0.001, + "num_tokens": 32321521.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 870.0, + "completions/max_terminated_length": 870.0, + "completions/mean_length": 366.65625, + "completions/mean_terminated_length": 366.65625, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.0739365651851066, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.022461338434368372, + "learning_rate": 7.721599999999999e-06, + "loss": 0.0393, + "num_tokens": 32353830.0, + "reward": 2.6947999000549316, + "reward_std": 0.4774615168571472, + "rewards/reward_fn/mean": 2.6947999000549316, + "rewards/reward_fn/std": 0.47746148705482483, + "step": 697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.0, + "completions/max_terminated_length": 580.0, + "completions/mean_length": 201.5625, + "completions/mean_terminated_length": 201.5625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.0740426434708815, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.030215767910704017, + "learning_rate": 7.7212e-06, + "loss": 0.0134, + "num_tokens": 32404856.0, + "reward": 3.9019250869750977, + "reward_std": 0.41585031151771545, + "rewards/reward_fn/mean": 3.9019250869750977, + "rewards/reward_fn/std": 0.41585028171539307, + "step": 698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.0, + "completions/max_terminated_length": 628.0, + "completions/mean_length": 203.59375, + "completions/mean_terminated_length": 203.59375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.07414872175665642, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9296875, + "kl": 0.022973356302827597, + "learning_rate": 7.720799999999999e-06, + "loss": -0.0069, + "num_tokens": 32430059.0, + "reward": 3.9313559532165527, + "reward_std": 0.38830989599227905, + "rewards/reward_fn/mean": 3.9313559532165527, + "rewards/reward_fn/std": 0.38830992579460144, + "step": 699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1273.0, + "completions/max_terminated_length": 1273.0, + "completions/mean_length": 319.5, + "completions/mean_terminated_length": 319.5, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.07425480004243132, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.02324189874343574, + "learning_rate": 7.7204e-06, + "loss": 0.0154, + "num_tokens": 32483995.0, + "reward": 2.4519386291503906, + "reward_std": 0.3796447217464447, + "rewards/reward_fn/mean": 2.4519386291503906, + "rewards/reward_fn/std": 0.3796447217464447, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 108.46875, + "completions/mean_terminated_length": 108.46875, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.07436087832820622, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1513671875, + "kl": 0.02412488660775125, + "learning_rate": 7.719999999999999e-06, + "loss": 0.001, + "num_tokens": 32525034.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 168.1875, + "completions/mean_terminated_length": 168.1875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.07446695661398112, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12255859375, + "kl": 0.026652783853933215, + "learning_rate": 7.7196e-06, + "loss": 0.0011, + "num_tokens": 32562288.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 817.0, + "completions/max_terminated_length": 817.0, + "completions/mean_length": 288.71875, + "completions/mean_terminated_length": 288.71875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.07457303489975602, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.023433730704709888, + "learning_rate": 7.719199999999999e-06, + "loss": -0.1515, + "num_tokens": 32607111.0, + "reward": 2.775975227355957, + "reward_std": 0.7966500520706177, + "rewards/reward_fn/mean": 2.775975227355957, + "rewards/reward_fn/std": 0.7966500520706177, + "step": 703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1670.0, + "completions/max_terminated_length": 1670.0, + "completions/mean_length": 549.375, + "completions/mean_terminated_length": 549.375, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.07467911318553092, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.02032529003918171, + "learning_rate": 7.7188e-06, + "loss": 0.0651, + "num_tokens": 32661107.0, + "reward": 2.5700185298919678, + "reward_std": 0.8670512437820435, + "rewards/reward_fn/mean": 2.5700185298919678, + "rewards/reward_fn/std": 0.8670512437820435, + "step": 704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1800.0, + "completions/max_terminated_length": 1800.0, + "completions/mean_length": 446.375, + "completions/mean_terminated_length": 446.375, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.07478519147130583, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.015976089402101934, + "learning_rate": 7.718399999999999e-06, + "loss": -0.0692, + "num_tokens": 32709343.0, + "reward": 3.776334285736084, + "reward_std": 0.7065488696098328, + "rewards/reward_fn/mean": 3.776334285736084, + "rewards/reward_fn/std": 0.706548810005188, + "step": 705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 781.0, + "completions/max_terminated_length": 781.0, + "completions/mean_length": 250.6875, + "completions/mean_terminated_length": 250.6875, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.07489126975708073, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.02156626316718757, + "learning_rate": 7.718e-06, + "loss": 0.0282, + "num_tokens": 32751189.0, + "reward": 3.279935359954834, + "reward_std": 0.8665456771850586, + "rewards/reward_fn/mean": 3.279935359954834, + "rewards/reward_fn/std": 0.8665456175804138, + "step": 706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 932.0, + "completions/max_terminated_length": 932.0, + "completions/mean_length": 203.4375, + "completions/mean_terminated_length": 203.4375, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.07499734804285563, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1279296875, + "kl": 0.031126373447477818, + "learning_rate": 7.7176e-06, + "loss": 0.0012, + "num_tokens": 32808963.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 283.96875, + "completions/mean_terminated_length": 283.96875, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.07510342632863053, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0947265625, + "kl": 0.02349319658242166, + "learning_rate": 7.7172e-06, + "loss": 0.0009, + "num_tokens": 32850274.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 175.03125, + "completions/mean_terminated_length": 175.03125, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.07520950461440543, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0927734375, + "kl": 0.021260776673443615, + "learning_rate": 7.7168e-06, + "loss": 0.0009, + "num_tokens": 32886243.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 775.0, + "completions/max_terminated_length": 775.0, + "completions/mean_length": 252.46875, + "completions/mean_terminated_length": 252.46875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.07531558290018034, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.025537249632179737, + "learning_rate": 7.7164e-06, + "loss": 0.0669, + "num_tokens": 32930514.0, + "reward": 2.982074737548828, + "reward_std": 0.7072264552116394, + "rewards/reward_fn/mean": 2.982074737548828, + "rewards/reward_fn/std": 0.7072264552116394, + "step": 710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1563.0, + "completions/max_terminated_length": 1563.0, + "completions/mean_length": 413.5, + "completions/mean_terminated_length": 413.5, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.07542166118595524, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.0188356313155964, + "learning_rate": 7.716e-06, + "loss": 0.1318, + "num_tokens": 32981026.0, + "reward": 2.7349185943603516, + "reward_std": 0.4819990396499634, + "rewards/reward_fn/mean": 2.7349185943603516, + "rewards/reward_fn/std": 0.48199906945228577, + "step": 711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 225.03125, + "completions/mean_terminated_length": 225.03125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.07552773947173014, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.011965643963776529, + "learning_rate": 7.7156e-06, + "loss": 0.0195, + "num_tokens": 33032067.0, + "reward": 3.974524974822998, + "reward_std": 0.10026301443576813, + "rewards/reward_fn/mean": 3.974524974822998, + "rewards/reward_fn/std": 0.10026300698518753, + "step": 712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 961.0, + "completions/max_terminated_length": 961.0, + "completions/mean_length": 248.59375, + "completions/mean_terminated_length": 248.59375, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.07563381775750504, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "kl": 0.022213632240891457, + "learning_rate": 7.7152e-06, + "loss": 0.0989, + "num_tokens": 33063670.0, + "reward": 3.016145944595337, + "reward_std": 0.5643318295478821, + "rewards/reward_fn/mean": 3.016145944595337, + "rewards/reward_fn/std": 0.5643318295478821, + "step": 713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1154.0, + "completions/max_terminated_length": 1154.0, + "completions/mean_length": 259.625, + "completions/mean_terminated_length": 259.625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.07573989604327994, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.032502480084076524, + "learning_rate": 7.7148e-06, + "loss": -0.0117, + "num_tokens": 33104906.0, + "reward": 3.8165884017944336, + "reward_std": 0.3915080428123474, + "rewards/reward_fn/mean": 3.8165884017944336, + "rewards/reward_fn/std": 0.3915080726146698, + "step": 714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 889.0, + "completions/max_terminated_length": 889.0, + "completions/mean_length": 203.03125, + "completions/mean_terminated_length": 203.03125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.07584597432905485, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.078125, + "kl": 0.01976885157637298, + "learning_rate": 7.7144e-06, + "loss": 0.0008, + "num_tokens": 33132939.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.0, + "completions/max_terminated_length": 579.0, + "completions/mean_length": 146.34375, + "completions/mean_terminated_length": 146.34375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.07595205261482975, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1494140625, + "kl": 0.0265513202175498, + "learning_rate": 7.714e-06, + "loss": 0.0011, + "num_tokens": 33170582.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 926.0, + "completions/max_terminated_length": 926.0, + "completions/mean_length": 281.71875, + "completions/mean_terminated_length": 281.71875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.07605813090060465, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.023149944143369794, + "learning_rate": 7.713599999999998e-06, + "loss": -0.0468, + "num_tokens": 33219533.0, + "reward": 3.8506064414978027, + "reward_std": 0.5878912210464478, + "rewards/reward_fn/mean": 3.8506064414978027, + "rewards/reward_fn/std": 0.5878912210464478, + "step": 717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1528.0, + "completions/max_terminated_length": 1528.0, + "completions/mean_length": 408.9375, + "completions/mean_terminated_length": 408.9375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.07616420918637955, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.020193473668769002, + "learning_rate": 7.7132e-06, + "loss": 0.1427, + "num_tokens": 33275499.0, + "reward": 3.861990451812744, + "reward_std": 0.3711107075214386, + "rewards/reward_fn/mean": 3.861990451812744, + "rewards/reward_fn/std": 0.371110737323761, + "step": 718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 873.0, + "completions/max_terminated_length": 873.0, + "completions/mean_length": 311.1875, + "completions/mean_terminated_length": 311.1875, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.07627028747215445, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "kl": 0.01826619717758149, + "learning_rate": 7.7128e-06, + "loss": 0.0951, + "num_tokens": 33316369.0, + "reward": 3.852308511734009, + "reward_std": 0.5811760425567627, + "rewards/reward_fn/mean": 3.852308511734009, + "rewards/reward_fn/std": 0.5811761021614075, + "step": 719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 175.0, + "completions/max_terminated_length": 175.0, + "completions/mean_length": 93.78125, + "completions/mean_terminated_length": 93.78125, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.07637636575792935, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13671875, + "kl": 0.025791630847379565, + "learning_rate": 7.7124e-06, + "loss": 0.001, + "num_tokens": 33357226.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1839.0, + "completions/mean_length": 664.59375, + "completions/mean_terminated_length": 619.9677124023438, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.07648244404370426, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.014834558707661927, + "learning_rate": 7.712e-06, + "loss": 0.1671, + "num_tokens": 33426845.0, + "reward": 3.423288345336914, + "reward_std": 0.9050231575965881, + "rewards/reward_fn/mean": 3.423288345336914, + "rewards/reward_fn/std": 0.9050231575965881, + "step": 721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1371.0, + "completions/max_terminated_length": 1371.0, + "completions/mean_length": 492.0625, + "completions/mean_terminated_length": 492.0625, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.07658852232947916, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.01824855396989733, + "learning_rate": 7.711599999999999e-06, + "loss": 0.0541, + "num_tokens": 33483199.0, + "reward": 2.606292724609375, + "reward_std": 0.2552023231983185, + "rewards/reward_fn/mean": 2.606292724609375, + "rewards/reward_fn/std": 0.2552023231983185, + "step": 722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1811.0, + "completions/max_terminated_length": 1811.0, + "completions/mean_length": 370.625, + "completions/mean_terminated_length": 370.625, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.07669460061525406, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.021258773282170296, + "learning_rate": 7.7112e-06, + "loss": -0.0289, + "num_tokens": 33531699.0, + "reward": 1.7273921966552734, + "reward_std": 0.20154969394207, + "rewards/reward_fn/mean": 1.7273921966552734, + "rewards/reward_fn/std": 0.2015496790409088, + "step": 723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1172.0, + "completions/max_terminated_length": 1172.0, + "completions/mean_length": 275.5625, + "completions/mean_terminated_length": 275.5625, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.07680067890102896, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.018158360850065947, + "learning_rate": 7.710799999999999e-06, + "loss": -0.0268, + "num_tokens": 33571397.0, + "reward": 2.812833547592163, + "reward_std": 0.2651287913322449, + "rewards/reward_fn/mean": 2.812833547592163, + "rewards/reward_fn/std": 0.2651287913322449, + "step": 724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1742.0, + "completions/mean_length": 508.9375, + "completions/mean_terminated_length": 459.2903137207031, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.07690675718680386, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.02326313848607242, + "learning_rate": 7.7104e-06, + "loss": 0.2781, + "num_tokens": 33636419.0, + "reward": 2.658267021179199, + "reward_std": 0.5596959590911865, + "rewards/reward_fn/mean": 2.658267021179199, + "rewards/reward_fn/std": 0.5596958994865417, + "step": 725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 775.0, + "completions/max_terminated_length": 775.0, + "completions/mean_length": 272.8125, + "completions/mean_terminated_length": 272.8125, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.07701283547257877, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0634765625, + "kl": 0.015897757722996175, + "learning_rate": 7.709999999999999e-06, + "loss": 0.0006, + "num_tokens": 33686141.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1789.0, + "completions/max_terminated_length": 1789.0, + "completions/mean_length": 474.3125, + "completions/mean_terminated_length": 474.3125, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.07711891375835367, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.016974500380456448, + "learning_rate": 7.7096e-06, + "loss": -0.0312, + "num_tokens": 33745319.0, + "reward": 3.0468087196350098, + "reward_std": 0.5193257927894592, + "rewards/reward_fn/mean": 3.0468087196350098, + "rewards/reward_fn/std": 0.5193257331848145, + "step": 727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1416.0, + "completions/max_terminated_length": 1416.0, + "completions/mean_length": 348.0, + "completions/mean_terminated_length": 348.0, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.07722499204412857, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.01765015278942883, + "learning_rate": 7.709199999999999e-06, + "loss": -0.0524, + "num_tokens": 33797127.0, + "reward": 3.139650821685791, + "reward_std": 0.5897535681724548, + "rewards/reward_fn/mean": 3.139650821685791, + "rewards/reward_fn/std": 0.5897536277770996, + "step": 728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1791.0, + "completions/mean_length": 532.4375, + "completions/mean_terminated_length": 483.5483703613281, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.07733107032990347, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.017708882922306657, + "learning_rate": 7.7088e-06, + "loss": 0.2641, + "num_tokens": 33851317.0, + "reward": 2.595705986022949, + "reward_std": 0.5950980186462402, + "rewards/reward_fn/mean": 2.595705986022949, + "rewards/reward_fn/std": 0.595098078250885, + "step": 729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1520.0, + "completions/max_terminated_length": 1520.0, + "completions/mean_length": 321.4375, + "completions/mean_terminated_length": 321.4375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.07743714861567837, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.80078125, + "kl": 0.022806552005931735, + "learning_rate": 7.7084e-06, + "loss": -0.1221, + "num_tokens": 33880131.0, + "reward": 3.959925651550293, + "reward_std": 0.226694256067276, + "rewards/reward_fn/mean": 3.959925651550293, + "rewards/reward_fn/std": 0.2266942858695984, + "step": 730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1690.0, + "completions/max_terminated_length": 1690.0, + "completions/mean_length": 309.5625, + "completions/mean_terminated_length": 309.5625, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.07754322690145327, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.390625, + "kl": 0.018843404366634786, + "learning_rate": 7.708e-06, + "loss": 0.2778, + "num_tokens": 33933077.0, + "reward": 3.921745777130127, + "reward_std": 0.30792757868766785, + "rewards/reward_fn/mean": 3.921745777130127, + "rewards/reward_fn/std": 0.30792760848999023, + "step": 731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 172.0, + "completions/max_terminated_length": 172.0, + "completions/mean_length": 102.9375, + "completions/mean_terminated_length": 102.9375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.07764930518722818, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16796875, + "kl": 0.023415476083755493, + "learning_rate": 7.7076e-06, + "loss": 0.0009, + "num_tokens": 33957107.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 865.0, + "completions/max_terminated_length": 865.0, + "completions/mean_length": 468.9375, + "completions/mean_terminated_length": 468.9375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.07775538347300308, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.019135850947350264, + "learning_rate": 7.7072e-06, + "loss": 0.0913, + "num_tokens": 34012753.0, + "reward": 3.1053218841552734, + "reward_std": 0.46024301648139954, + "rewards/reward_fn/mean": 3.1053218841552734, + "rewards/reward_fn/std": 0.46024298667907715, + "step": 733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1717.0, + "completions/mean_length": 418.90625, + "completions/mean_terminated_length": 366.3548278808594, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.07786146175877798, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.017279536346904933, + "learning_rate": 7.7068e-06, + "loss": 0.3037, + "num_tokens": 34061870.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1323.0, + "completions/mean_length": 606.53125, + "completions/mean_terminated_length": 560.0322265625, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.07796754004455288, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.014971327618695796, + "learning_rate": 7.7064e-06, + "loss": 0.1449, + "num_tokens": 34126623.0, + "reward": 2.847194194793701, + "reward_std": 0.6151965260505676, + "rewards/reward_fn/mean": 2.847194194793701, + "rewards/reward_fn/std": 0.6151964664459229, + "step": 735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 108.0, + "completions/mean_terminated_length": 108.0, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.07807361833032778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07958984375, + "kl": 0.013533458928577602, + "learning_rate": 7.706e-06, + "loss": 0.0005, + "num_tokens": 34173343.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 296.53125, + "completions/mean_terminated_length": 296.53125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.07817969661610269, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0625, + "kl": 0.018062757910229266, + "learning_rate": 7.7056e-06, + "loss": 0.0007, + "num_tokens": 34217328.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1122.0, + "completions/max_terminated_length": 1122.0, + "completions/mean_length": 395.125, + "completions/mean_terminated_length": 395.125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.07828577490187759, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.014009194332174957, + "learning_rate": 7.705199999999999e-06, + "loss": -0.0219, + "num_tokens": 34247092.0, + "reward": 2.814393997192383, + "reward_std": 0.20588137209415436, + "rewards/reward_fn/mean": 2.814393997192383, + "rewards/reward_fn/std": 0.20588135719299316, + "step": 738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1819.0, + "completions/max_terminated_length": 1819.0, + "completions/mean_length": 479.1875, + "completions/mean_terminated_length": 479.1875, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.07839185318765249, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.016489942325279117, + "learning_rate": 7.7048e-06, + "loss": 0.0784, + "num_tokens": 34301050.0, + "reward": 2.9646849632263184, + "reward_std": 0.4474869966506958, + "rewards/reward_fn/mean": 2.9646849632263184, + "rewards/reward_fn/std": 0.4474869966506958, + "step": 739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1874.0, + "completions/mean_length": 460.28125, + "completions/mean_terminated_length": 409.06451416015625, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.07849793147342739, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.021060561761260033, + "learning_rate": 7.704399999999999e-06, + "loss": 0.2307, + "num_tokens": 34379779.0, + "reward": 3.593684673309326, + "reward_std": 0.8480998873710632, + "rewards/reward_fn/mean": 3.593684673309326, + "rewards/reward_fn/std": 0.8480998873710632, + "step": 740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1563.0, + "completions/max_terminated_length": 1563.0, + "completions/mean_length": 234.625, + "completions/mean_terminated_length": 234.625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.07860400975920229, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.734375, + "kl": 0.026116218185052276, + "learning_rate": 7.704e-06, + "loss": -0.3654, + "num_tokens": 34420247.0, + "reward": 3.2040047645568848, + "reward_std": 0.2753956615924835, + "rewards/reward_fn/mean": 3.2040047645568848, + "rewards/reward_fn/std": 0.27539563179016113, + "step": 741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1012.0, + "completions/max_terminated_length": 1012.0, + "completions/mean_length": 304.15625, + "completions/mean_terminated_length": 304.15625, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.0787100880449772, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.0184671861352399, + "learning_rate": 7.7036e-06, + "loss": -0.0289, + "num_tokens": 34460348.0, + "reward": 3.8210489749908447, + "reward_std": 0.423511803150177, + "rewards/reward_fn/mean": 3.8210489749908447, + "rewards/reward_fn/std": 0.4235118627548218, + "step": 742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1977.0, + "completions/max_terminated_length": 1977.0, + "completions/mean_length": 362.90625, + "completions/mean_terminated_length": 362.90625, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.0788161663307521, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.02415016118902713, + "learning_rate": 7.7032e-06, + "loss": 0.0105, + "num_tokens": 34512857.0, + "reward": 3.9323811531066895, + "reward_std": 0.2660841941833496, + "rewards/reward_fn/mean": 3.9323811531066895, + "rewards/reward_fn/std": 0.2660841643810272, + "step": 743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1125.0, + "completions/max_terminated_length": 1125.0, + "completions/mean_length": 401.65625, + "completions/mean_terminated_length": 401.65625, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.078922244616527, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.022820353973656893, + "learning_rate": 7.7028e-06, + "loss": 0.0953, + "num_tokens": 34545998.0, + "reward": 2.9798271656036377, + "reward_std": 0.3881252706050873, + "rewards/reward_fn/mean": 2.9798271656036377, + "rewards/reward_fn/std": 0.38812533020973206, + "step": 744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 685.0, + "completions/max_terminated_length": 685.0, + "completions/mean_length": 161.46875, + "completions/mean_terminated_length": 161.46875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.0790283229023019, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.091796875, + "kl": 0.018742251209914684, + "learning_rate": 7.7024e-06, + "loss": 0.0007, + "num_tokens": 34589949.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1595.0, + "completions/max_terminated_length": 1595.0, + "completions/mean_length": 347.75, + "completions/mean_terminated_length": 347.75, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.0791344011880768, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05224609375, + "kl": 0.016581954550929368, + "learning_rate": 7.702e-06, + "loss": 0.0007, + "num_tokens": 34637141.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1318.0, + "completions/mean_length": 413.0625, + "completions/mean_terminated_length": 360.32257080078125, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.0792404794738517, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.01734267408028245, + "learning_rate": 7.7016e-06, + "loss": 0.1899, + "num_tokens": 34671959.0, + "reward": 3.5228824615478516, + "reward_std": 0.6560260057449341, + "rewards/reward_fn/mean": 3.5228824615478516, + "rewards/reward_fn/std": 0.6560259461402893, + "step": 747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1482.0, + "completions/mean_length": 469.125, + "completions/mean_terminated_length": 363.86669921875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.07934655775962661, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.016482632607221603, + "learning_rate": 7.7012e-06, + "loss": 0.3411, + "num_tokens": 34707547.0, + "reward": 2.742053508758545, + "reward_std": 0.8046372532844543, + "rewards/reward_fn/mean": 2.742053508758545, + "rewards/reward_fn/std": 0.8046371936798096, + "step": 748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 823.0, + "completions/max_terminated_length": 823.0, + "completions/mean_length": 248.625, + "completions/mean_terminated_length": 248.625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.07945263604540151, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.02026499854400754, + "learning_rate": 7.7008e-06, + "loss": 0.0099, + "num_tokens": 34761615.0, + "reward": 3.801016330718994, + "reward_std": 0.42593979835510254, + "rewards/reward_fn/mean": 3.801016330718994, + "rewards/reward_fn/std": 0.42593976855278015, + "step": 749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.0, + "completions/max_terminated_length": 628.0, + "completions/mean_length": 186.3125, + "completions/mean_terminated_length": 186.3125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.07955871433117641, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.98046875, + "kl": 0.015264550573192537, + "learning_rate": 7.7004e-06, + "loss": -0.0742, + "num_tokens": 34811801.0, + "reward": 3.933756113052368, + "reward_std": 0.3747324049472809, + "rewards/reward_fn/mean": 3.933756113052368, + "rewards/reward_fn/std": 0.37473243474960327, + "step": 750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1936.0, + "completions/max_terminated_length": 1936.0, + "completions/mean_length": 328.8125, + "completions/mean_terminated_length": 328.8125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.07966479261695131, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11279296875, + "kl": 0.01918186468537897, + "learning_rate": 7.699999999999999e-06, + "loss": 0.0008, + "num_tokens": 34858419.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1659.0, + "completions/max_terminated_length": 1659.0, + "completions/mean_length": 412.8125, + "completions/mean_terminated_length": 412.8125, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.07977087090272621, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.02011913899332285, + "learning_rate": 7.6996e-06, + "loss": -0.0163, + "num_tokens": 34923021.0, + "reward": 3.248586893081665, + "reward_std": 0.7823631763458252, + "rewards/reward_fn/mean": 3.248586893081665, + "rewards/reward_fn/std": 0.7823631763458252, + "step": 752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1150.0, + "completions/max_terminated_length": 1150.0, + "completions/mean_length": 188.375, + "completions/mean_terminated_length": 188.375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.07987694918850112, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.65625, + "kl": 0.02701102104038, + "learning_rate": 7.6992e-06, + "loss": 0.1138, + "num_tokens": 34963833.0, + "reward": 3.7283644676208496, + "reward_std": 0.5220069885253906, + "rewards/reward_fn/mean": 3.7283644676208496, + "rewards/reward_fn/std": 0.5220070481300354, + "step": 753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1733.0, + "completions/max_terminated_length": 1733.0, + "completions/mean_length": 799.8125, + "completions/mean_terminated_length": 799.8125, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.07998302747427602, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.013593915617093444, + "learning_rate": 7.6988e-06, + "loss": 0.0106, + "num_tokens": 35034419.0, + "reward": 2.5703787803649902, + "reward_std": 0.6685802340507507, + "rewards/reward_fn/mean": 2.5703787803649902, + "rewards/reward_fn/std": 0.668580174446106, + "step": 754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1257.0, + "completions/max_terminated_length": 1257.0, + "completions/mean_length": 214.875, + "completions/mean_terminated_length": 214.875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.08008910576005092, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "kl": 0.017514656181447208, + "learning_rate": 7.6984e-06, + "loss": 0.3551, + "num_tokens": 35078991.0, + "reward": 3.92777681350708, + "reward_std": 0.4085560739040375, + "rewards/reward_fn/mean": 3.92777681350708, + "rewards/reward_fn/std": 0.4085560142993927, + "step": 755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 935.0, + "completions/max_terminated_length": 935.0, + "completions/mean_length": 200.375, + "completions/mean_terminated_length": 200.375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.08019518404582582, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10009765625, + "kl": 0.022400660207495093, + "learning_rate": 7.698e-06, + "loss": 0.0009, + "num_tokens": 35115003.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 990.0, + "completions/max_terminated_length": 990.0, + "completions/mean_length": 245.65625, + "completions/mean_terminated_length": 245.65625, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.08030126233160072, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.01861444814130664, + "learning_rate": 7.6976e-06, + "loss": 0.0078, + "num_tokens": 35159408.0, + "reward": 2.909533977508545, + "reward_std": 0.039463140070438385, + "rewards/reward_fn/mean": 2.909533977508545, + "rewards/reward_fn/std": 0.03946312144398689, + "step": 757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 918.0, + "completions/max_terminated_length": 918.0, + "completions/mean_length": 254.03125, + "completions/mean_terminated_length": 254.03125, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.08040734061737562, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.018796015181578696, + "learning_rate": 7.6972e-06, + "loss": 0.0522, + "num_tokens": 35185681.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1630.0, + "completions/max_terminated_length": 1630.0, + "completions/mean_length": 317.6875, + "completions/mean_terminated_length": 317.6875, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.08051341890315053, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.015750034246593714, + "learning_rate": 7.696799999999999e-06, + "loss": 0.0456, + "num_tokens": 35228807.0, + "reward": 3.8852972984313965, + "reward_std": 0.3084171712398529, + "rewards/reward_fn/mean": 3.8852972984313965, + "rewards/reward_fn/std": 0.3084172010421753, + "step": 759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1313.0, + "completions/max_terminated_length": 1313.0, + "completions/mean_length": 429.1875, + "completions/mean_terminated_length": 429.1875, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.08061949718892543, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.016051248530857265, + "learning_rate": 7.6964e-06, + "loss": 0.0066, + "num_tokens": 35264141.0, + "reward": 3.350024700164795, + "reward_std": 0.5828197002410889, + "rewards/reward_fn/mean": 3.350024700164795, + "rewards/reward_fn/std": 0.5828196406364441, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 849.0, + "completions/mean_length": 641.0, + "completions/mean_terminated_length": 495.4482727050781, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.08072557547470033, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.01636024343315512, + "learning_rate": 7.695999999999999e-06, + "loss": 0.2756, + "num_tokens": 35323661.0, + "reward": 3.2801287174224854, + "reward_std": 1.179065227508545, + "rewards/reward_fn/mean": 3.2801287174224854, + "rewards/reward_fn/std": 1.179065227508545, + "step": 761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1244.0, + "completions/mean_length": 338.78125, + "completions/mean_terminated_length": 283.6451416015625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.08083165376047523, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.023242034018039703, + "learning_rate": 7.6956e-06, + "loss": 0.3512, + "num_tokens": 35355654.0, + "reward": 3.727604866027832, + "reward_std": 0.7867724895477295, + "rewards/reward_fn/mean": 3.727604866027832, + "rewards/reward_fn/std": 0.7867724895477295, + "step": 762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1400.0, + "completions/mean_length": 434.65625, + "completions/mean_terminated_length": 382.6128845214844, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.08093773204625013, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.020363064482808113, + "learning_rate": 7.695199999999999e-06, + "loss": 0.166, + "num_tokens": 35397915.0, + "reward": 2.0777950286865234, + "reward_std": 0.674912691116333, + "rewards/reward_fn/mean": 2.0777950286865234, + "rewards/reward_fn/std": 0.6749126315116882, + "step": 763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1564.0, + "completions/max_terminated_length": 1564.0, + "completions/mean_length": 552.4375, + "completions/mean_terminated_length": 552.4375, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.08104381033202504, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.01603428611997515, + "learning_rate": 7.6948e-06, + "loss": 0.0247, + "num_tokens": 35448105.0, + "reward": 2.9090466499328613, + "reward_std": 0.07430879026651382, + "rewards/reward_fn/mean": 2.9090466499328613, + "rewards/reward_fn/std": 0.07430876046419144, + "step": 764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 534.0, + "completions/mean_length": 366.125, + "completions/mean_terminated_length": 311.8709716796875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.08114988861779994, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "kl": 0.019333304720930755, + "learning_rate": 7.6944e-06, + "loss": 0.2851, + "num_tokens": 35477549.0, + "reward": 3.4079909324645996, + "reward_std": 0.7734149098396301, + "rewards/reward_fn/mean": 3.4079909324645996, + "rewards/reward_fn/std": 0.7734148502349854, + "step": 765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1036.0, + "completions/max_terminated_length": 1036.0, + "completions/mean_length": 350.1875, + "completions/mean_terminated_length": 350.1875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.08125596690357484, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.020013973116874695, + "learning_rate": 7.694e-06, + "loss": 0.0958, + "num_tokens": 35539795.0, + "reward": 3.3708925247192383, + "reward_std": 0.7116749286651611, + "rewards/reward_fn/mean": 3.3708925247192383, + "rewards/reward_fn/std": 0.7116749286651611, + "step": 766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1595.0, + "completions/mean_length": 510.46875, + "completions/mean_terminated_length": 460.8709411621094, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.08136204518934974, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.013919757795520127, + "learning_rate": 7.6936e-06, + "loss": 0.2956, + "num_tokens": 35605730.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 926.0, + "completions/mean_length": 571.15625, + "completions/mean_terminated_length": 523.51611328125, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.08146812347512464, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.019678838085383177, + "learning_rate": 7.6932e-06, + "loss": 0.0894, + "num_tokens": 35654567.0, + "reward": 2.3275630474090576, + "reward_std": 0.6938040256500244, + "rewards/reward_fn/mean": 2.3275630474090576, + "rewards/reward_fn/std": 0.6938039660453796, + "step": 768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 202.84375, + "completions/mean_terminated_length": 202.84375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.08157420176089955, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.546875, + "kl": 0.019846069626510143, + "learning_rate": 7.6928e-06, + "loss": 0.0203, + "num_tokens": 35690050.0, + "reward": 3.797757625579834, + "reward_std": 0.4278443157672882, + "rewards/reward_fn/mean": 3.797757625579834, + "rewards/reward_fn/std": 0.4278443455696106, + "step": 769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1466.0, + "completions/max_terminated_length": 1466.0, + "completions/mean_length": 375.28125, + "completions/mean_terminated_length": 375.28125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.08168028004667445, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.022206757916137576, + "learning_rate": 7.6924e-06, + "loss": -0.0462, + "num_tokens": 35735115.0, + "reward": 2.7710344791412354, + "reward_std": 0.07037150859832764, + "rewards/reward_fn/mean": 2.7710344791412354, + "rewards/reward_fn/std": 0.07037156820297241, + "step": 770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 173.28125, + "completions/mean_terminated_length": 173.28125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.08178635833244935, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1748046875, + "kl": 0.02803934703115374, + "learning_rate": 7.692e-06, + "loss": 0.0011, + "num_tokens": 35761108.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 718.0, + "completions/max_terminated_length": 718.0, + "completions/mean_length": 257.5625, + "completions/mean_terminated_length": 257.5625, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.08189243661822425, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.015047145425342023, + "learning_rate": 7.6916e-06, + "loss": -0.0131, + "num_tokens": 35800998.0, + "reward": 3.958484649658203, + "reward_std": 0.23484668135643005, + "rewards/reward_fn/mean": 3.958484649658203, + "rewards/reward_fn/std": 0.23484671115875244, + "step": 772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1469.0, + "completions/max_terminated_length": 1469.0, + "completions/mean_length": 406.8125, + "completions/mean_terminated_length": 406.8125, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.08199851490399915, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05859375, + "kl": 0.015709312865510583, + "learning_rate": 7.6912e-06, + "loss": 0.0006, + "num_tokens": 35847232.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1601.0, + "completions/max_terminated_length": 1601.0, + "completions/mean_length": 267.65625, + "completions/mean_terminated_length": 267.65625, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.08210459318977405, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.017633047536946833, + "learning_rate": 7.6908e-06, + "loss": 0.1638, + "num_tokens": 35907957.0, + "reward": 2.982841730117798, + "reward_std": 0.028663719072937965, + "rewards/reward_fn/mean": 2.982841730117798, + "rewards/reward_fn/std": 0.028663722798228264, + "step": 774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 933.0, + "completions/max_terminated_length": 933.0, + "completions/mean_length": 294.3125, + "completions/mean_terminated_length": 294.3125, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.08221067147554896, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.015748307458125055, + "learning_rate": 7.6904e-06, + "loss": -0.0098, + "num_tokens": 35951743.0, + "reward": 3.4863507747650146, + "reward_std": 0.9863615036010742, + "rewards/reward_fn/mean": 3.4863507747650146, + "rewards/reward_fn/std": 0.9863614439964294, + "step": 775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1481.0, + "completions/max_terminated_length": 1481.0, + "completions/mean_length": 532.0625, + "completions/mean_terminated_length": 532.0625, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.08231674976132386, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.01679909380618483, + "learning_rate": 7.69e-06, + "loss": 0.0363, + "num_tokens": 36001249.0, + "reward": 3.2947301864624023, + "reward_std": 0.632847249507904, + "rewards/reward_fn/mean": 3.2947301864624023, + "rewards/reward_fn/std": 0.6328471899032593, + "step": 776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1166.0, + "completions/mean_length": 421.5, + "completions/mean_terminated_length": 369.0322570800781, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.08242282804709876, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.020656271022744477, + "learning_rate": 7.6896e-06, + "loss": 0.2965, + "num_tokens": 36050065.0, + "reward": 2.78275203704834, + "reward_std": 0.5125721096992493, + "rewards/reward_fn/mean": 2.78275203704834, + "rewards/reward_fn/std": 0.5125721096992493, + "step": 777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1732.0, + "completions/max_terminated_length": 1732.0, + "completions/mean_length": 440.625, + "completions/mean_terminated_length": 440.625, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.08252890633287366, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.01858210703358054, + "learning_rate": 7.6892e-06, + "loss": 0.1469, + "num_tokens": 36102053.0, + "reward": 3.6386284828186035, + "reward_std": 0.5598682761192322, + "rewards/reward_fn/mean": 3.6386284828186035, + "rewards/reward_fn/std": 0.559868335723877, + "step": 778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.0, + "completions/max_terminated_length": 628.0, + "completions/mean_length": 107.40625, + "completions/mean_terminated_length": 107.40625, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.08263498461864856, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.111328125, + "kl": 0.017615402408409864, + "learning_rate": 7.6888e-06, + "loss": 0.0007, + "num_tokens": 36141522.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 888.0, + "completions/max_terminated_length": 888.0, + "completions/mean_length": 498.15625, + "completions/mean_terminated_length": 498.15625, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.08274106290442347, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0234375, + "kl": 0.020500344457104802, + "learning_rate": 7.688399999999999e-06, + "loss": 0.0002, + "num_tokens": 36191415.0, + "reward": 2.5993824005126953, + "reward_std": 0.1720532327890396, + "rewards/reward_fn/mean": 2.5993824005126953, + "rewards/reward_fn/std": 0.1720532774925232, + "step": 780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 277.96875, + "completions/mean_terminated_length": 277.96875, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.08284714119019837, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.01927991397678852, + "learning_rate": 7.688e-06, + "loss": 0.0236, + "num_tokens": 36230550.0, + "reward": 2.7684693336486816, + "reward_std": 0.029289107769727707, + "rewards/reward_fn/mean": 2.7684693336486816, + "rewards/reward_fn/std": 0.029289091005921364, + "step": 781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 131.03125, + "completions/mean_terminated_length": 131.03125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.08295321947597327, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.010917730629444122, + "learning_rate": 7.687599999999999e-06, + "loss": 0.0665, + "num_tokens": 36268119.0, + "reward": 3.0935535430908203, + "reward_std": 0.016159607097506523, + "rewards/reward_fn/mean": 3.0935535430908203, + "rewards/reward_fn/std": 0.016159581020474434, + "step": 782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1797.0, + "completions/max_terminated_length": 1797.0, + "completions/mean_length": 546.78125, + "completions/mean_terminated_length": 546.78125, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.08305929776174817, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.010865212534554303, + "learning_rate": 7.6872e-06, + "loss": 0.0287, + "num_tokens": 36342576.0, + "reward": 3.487700939178467, + "reward_std": 0.5917753577232361, + "rewards/reward_fn/mean": 3.487700939178467, + "rewards/reward_fn/std": 0.5917754173278809, + "step": 783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1078.0, + "completions/max_terminated_length": 1078.0, + "completions/mean_length": 360.71875, + "completions/mean_terminated_length": 360.71875, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.08316537604752307, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.024502220330759883, + "learning_rate": 7.686799999999999e-06, + "loss": 0.0588, + "num_tokens": 36373287.0, + "reward": 3.8351099491119385, + "reward_std": 0.3897174298763275, + "rewards/reward_fn/mean": 3.8351099491119385, + "rewards/reward_fn/std": 0.38971734046936035, + "step": 784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1505.0, + "completions/mean_length": 666.5625, + "completions/mean_terminated_length": 574.4666748046875, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.08327145433329797, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.01942377514205873, + "learning_rate": 7.6864e-06, + "loss": 0.3233, + "num_tokens": 36432665.0, + "reward": 2.4874258041381836, + "reward_std": 0.7904840111732483, + "rewards/reward_fn/mean": 2.4874258041381836, + "rewards/reward_fn/std": 0.7904840111732483, + "step": 785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.0, + "completions/max_terminated_length": 562.0, + "completions/mean_length": 242.3125, + "completions/mean_terminated_length": 242.3125, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.08337753261907288, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "kl": 0.031069141812622547, + "learning_rate": 7.685999999999999e-06, + "loss": 0.0068, + "num_tokens": 36487043.0, + "reward": 3.757603168487549, + "reward_std": 0.4073527753353119, + "rewards/reward_fn/mean": 3.757603168487549, + "rewards/reward_fn/std": 0.4073527753353119, + "step": 786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 897.0, + "completions/max_terminated_length": 897.0, + "completions/mean_length": 190.9375, + "completions/mean_terminated_length": 190.9375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.08348361090484778, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.023711836896836758, + "learning_rate": 7.6856e-06, + "loss": 0.0631, + "num_tokens": 36527585.0, + "reward": 2.923456907272339, + "reward_std": 0.28942155838012695, + "rewards/reward_fn/mean": 2.923456907272339, + "rewards/reward_fn/std": 0.28942152857780457, + "step": 787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 724.0, + "completions/max_terminated_length": 724.0, + "completions/mean_length": 271.09375, + "completions/mean_terminated_length": 271.09375, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.08358968919062268, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.020150523632764816, + "learning_rate": 7.685199999999999e-06, + "loss": 0.0728, + "num_tokens": 36567908.0, + "reward": 3.884533405303955, + "reward_std": 0.47682517766952515, + "rewards/reward_fn/mean": 3.884533405303955, + "rewards/reward_fn/std": 0.47682514786720276, + "step": 788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.0, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 202.4375, + "completions/mean_terminated_length": 202.4375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.08369576747639758, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.03790642227977514, + "learning_rate": 7.6848e-06, + "loss": 0.0211, + "num_tokens": 36607410.0, + "reward": 3.8924498558044434, + "reward_std": 0.3397402763366699, + "rewards/reward_fn/mean": 3.8924498558044434, + "rewards/reward_fn/std": 0.3397402763366699, + "step": 789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 991.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 501.4375, + "completions/mean_terminated_length": 501.4375, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.08380184576217248, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.02125004376284778, + "learning_rate": 7.6844e-06, + "loss": 0.0061, + "num_tokens": 36672960.0, + "reward": 2.670154571533203, + "reward_std": 0.5752670168876648, + "rewards/reward_fn/mean": 2.670154571533203, + "rewards/reward_fn/std": 0.57526695728302, + "step": 790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 717.0, + "completions/max_terminated_length": 717.0, + "completions/mean_length": 455.21875, + "completions/mean_terminated_length": 455.21875, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.08390792404794739, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.020999554311856627, + "learning_rate": 7.684e-06, + "loss": 0.0642, + "num_tokens": 36732487.0, + "reward": 3.457927942276001, + "reward_std": 0.5872803330421448, + "rewards/reward_fn/mean": 3.457927942276001, + "rewards/reward_fn/std": 0.5872803330421448, + "step": 791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 982.0, + "completions/max_terminated_length": 982.0, + "completions/mean_length": 186.1875, + "completions/mean_terminated_length": 186.1875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.08401400233372229, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1826171875, + "kl": 0.032360953744500875, + "learning_rate": 7.6836e-06, + "loss": 0.0013, + "num_tokens": 36765805.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 151.46875, + "completions/mean_terminated_length": 151.46875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.08412008061949719, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1279296875, + "kl": 0.021889066323637962, + "learning_rate": 7.6832e-06, + "loss": 0.0009, + "num_tokens": 36804956.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 992.0, + "completions/max_terminated_length": 992.0, + "completions/mean_length": 199.96875, + "completions/mean_terminated_length": 199.96875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.08422615890527209, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09814453125, + "kl": 0.018962694564834237, + "learning_rate": 7.6828e-06, + "loss": 0.0008, + "num_tokens": 36844315.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1186.0, + "completions/max_terminated_length": 1186.0, + "completions/mean_length": 337.875, + "completions/mean_terminated_length": 337.875, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.08433223719104699, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.021812525810673833, + "learning_rate": 7.6824e-06, + "loss": 0.1178, + "num_tokens": 36890487.0, + "reward": 3.9017152786254883, + "reward_std": 0.3109425902366638, + "rewards/reward_fn/mean": 3.9017152786254883, + "rewards/reward_fn/std": 0.3109425902366638, + "step": 795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1777.0, + "completions/mean_length": 704.53125, + "completions/mean_terminated_length": 614.9666748046875, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.0844383154768219, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.02236817847006023, + "learning_rate": 7.682e-06, + "loss": 0.3403, + "num_tokens": 36943112.0, + "reward": 2.4587063789367676, + "reward_std": 0.5394301414489746, + "rewards/reward_fn/mean": 2.4587063789367676, + "rewards/reward_fn/std": 0.5394301414489746, + "step": 796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1273.0, + "completions/max_terminated_length": 1273.0, + "completions/mean_length": 536.34375, + "completions/mean_terminated_length": 536.34375, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.0845443937625968, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.02385992626659572, + "learning_rate": 7.6816e-06, + "loss": 0.0694, + "num_tokens": 36997363.0, + "reward": 2.4099903106689453, + "reward_std": 0.5640437602996826, + "rewards/reward_fn/mean": 2.4099903106689453, + "rewards/reward_fn/std": 0.5640437602996826, + "step": 797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 149.15625, + "completions/mean_terminated_length": 149.15625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.0846504720483717, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.091796875, + "kl": 0.017519668908789754, + "learning_rate": 7.681199999999999e-06, + "loss": 0.0007, + "num_tokens": 37028024.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1117.0, + "completions/max_terminated_length": 1117.0, + "completions/mean_length": 313.40625, + "completions/mean_terminated_length": 313.40625, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.0847565503341466, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9765625, + "kl": 0.022777023958042264, + "learning_rate": 7.6808e-06, + "loss": 0.0037, + "num_tokens": 37076261.0, + "reward": 3.26804256439209, + "reward_std": 0.7380927801132202, + "rewards/reward_fn/mean": 3.26804256439209, + "rewards/reward_fn/std": 0.7380927801132202, + "step": 799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1002.0, + "completions/max_terminated_length": 1002.0, + "completions/mean_length": 220.0, + "completions/mean_terminated_length": 220.0, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.0848626286199215, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.04452996770851314, + "learning_rate": 7.680399999999998e-06, + "loss": 0.074, + "num_tokens": 37114917.0, + "reward": 3.9180896282196045, + "reward_std": 0.26771649718284607, + "rewards/reward_fn/mean": 3.9180896282196045, + "rewards/reward_fn/std": 0.26771649718284607, + "step": 800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 929.0, + "completions/max_terminated_length": 929.0, + "completions/mean_length": 374.96875, + "completions/mean_terminated_length": 374.96875, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.0849687069056964, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.01646363304462284, + "learning_rate": 7.68e-06, + "loss": 0.0849, + "num_tokens": 37163716.0, + "reward": 2.890430450439453, + "reward_std": 0.055527813732624054, + "rewards/reward_fn/mean": 2.890430450439453, + "rewards/reward_fn/std": 0.05552782118320465, + "step": 801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 709.0, + "completions/max_terminated_length": 709.0, + "completions/mean_length": 156.5, + "completions/mean_terminated_length": 156.5, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.08507478519147131, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16796875, + "kl": 0.03423686744645238, + "learning_rate": 7.6796e-06, + "loss": 0.0014, + "num_tokens": 37213556.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 926.0, + "completions/max_terminated_length": 926.0, + "completions/mean_length": 260.8125, + "completions/mean_terminated_length": 260.8125, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.08518086347724621, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.01638385117985308, + "learning_rate": 7.6792e-06, + "loss": -0.076, + "num_tokens": 37258478.0, + "reward": 3.609133243560791, + "reward_std": 0.5494344234466553, + "rewards/reward_fn/mean": 3.609133243560791, + "rewards/reward_fn/std": 0.5494344234466553, + "step": 803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1347.0, + "completions/max_terminated_length": 1347.0, + "completions/mean_length": 448.46875, + "completions/mean_terminated_length": 448.46875, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.08528694176302111, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.020947554847225547, + "learning_rate": 7.6788e-06, + "loss": -0.1078, + "num_tokens": 37296669.0, + "reward": 3.074223518371582, + "reward_std": 0.5923977494239807, + "rewards/reward_fn/mean": 3.074223518371582, + "rewards/reward_fn/std": 0.5923976898193359, + "step": 804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 110.90625, + "completions/mean_terminated_length": 110.90625, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.08539302004879601, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1103515625, + "kl": 0.017318370169959962, + "learning_rate": 7.6784e-06, + "loss": 0.0007, + "num_tokens": 37333658.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 197.4375, + "completions/mean_terminated_length": 197.4375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.08549909833457091, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4375, + "kl": 0.02576698805205524, + "learning_rate": 7.678e-06, + "loss": 0.0374, + "num_tokens": 37370536.0, + "reward": 3.7869386672973633, + "reward_std": 0.6730425357818604, + "rewards/reward_fn/mean": 3.7869386672973633, + "rewards/reward_fn/std": 0.6730424761772156, + "step": 806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.0, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 171.59375, + "completions/mean_terminated_length": 171.59375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.08560517662034582, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.022562434431165457, + "learning_rate": 7.677599999999999e-06, + "loss": 0.017, + "num_tokens": 37412859.0, + "reward": 3.9462358951568604, + "reward_std": 0.21158860623836517, + "rewards/reward_fn/mean": 3.9462358951568604, + "rewards/reward_fn/std": 0.21158860623836517, + "step": 807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 796.0, + "completions/max_terminated_length": 796.0, + "completions/mean_length": 177.53125, + "completions/mean_terminated_length": 177.53125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.08571125490612072, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19921875, + "kl": 0.029298200272023678, + "learning_rate": 7.6772e-06, + "loss": 0.0012, + "num_tokens": 37452204.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.0, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 171.09375, + "completions/mean_terminated_length": 171.09375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.08581733319189562, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.130859375, + "kl": 0.028025910491123796, + "learning_rate": 7.676799999999999e-06, + "loss": 0.0011, + "num_tokens": 37508335.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 811.0, + "completions/max_terminated_length": 811.0, + "completions/mean_length": 263.25, + "completions/mean_terminated_length": 263.25, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.08592341147767052, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.020565056474879384, + "learning_rate": 7.6764e-06, + "loss": -0.0028, + "num_tokens": 37538775.0, + "reward": 3.859889507293701, + "reward_std": 0.5514804124832153, + "rewards/reward_fn/mean": 3.859889507293701, + "rewards/reward_fn/std": 0.5514804124832153, + "step": 810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1669.0, + "completions/max_terminated_length": 1669.0, + "completions/mean_length": 508.625, + "completions/mean_terminated_length": 508.625, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.08602948976344542, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.016198519384488463, + "learning_rate": 7.675999999999999e-06, + "loss": 0.1003, + "num_tokens": 37572555.0, + "reward": 3.0730252265930176, + "reward_std": 0.4106665253639221, + "rewards/reward_fn/mean": 3.0730252265930176, + "rewards/reward_fn/std": 0.4106665253639221, + "step": 811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1107.0, + "completions/max_terminated_length": 1107.0, + "completions/mean_length": 251.84375, + "completions/mean_terminated_length": 251.84375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.08613556804922032, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "kl": 0.024606948718428612, + "learning_rate": 7.6756e-06, + "loss": 0.2776, + "num_tokens": 37617158.0, + "reward": 3.967801332473755, + "reward_std": 0.18214285373687744, + "rewards/reward_fn/mean": 3.967801332473755, + "rewards/reward_fn/std": 0.18214282393455505, + "step": 812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1282.0, + "completions/max_terminated_length": 1282.0, + "completions/mean_length": 566.5, + "completions/mean_terminated_length": 566.5, + "completions/min_length": 353.0, + "completions/min_terminated_length": 353.0, + "epoch": 0.08624164633499523, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.028156861662864685, + "learning_rate": 7.6752e-06, + "loss": 0.0301, + "num_tokens": 37669654.0, + "reward": 2.7869529724121094, + "reward_std": 0.3411664366722107, + "rewards/reward_fn/mean": 2.7869529724121094, + "rewards/reward_fn/std": 0.3411664366722107, + "step": 813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 137.625, + "completions/mean_terminated_length": 137.625, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.08634772462077013, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12890625, + "kl": 0.021783202653750777, + "learning_rate": 7.6748e-06, + "loss": 0.0009, + "num_tokens": 37707338.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 776.0, + "completions/max_terminated_length": 776.0, + "completions/mean_length": 268.15625, + "completions/mean_terminated_length": 268.15625, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.08645380290654503, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.020849171094596386, + "learning_rate": 7.6744e-06, + "loss": -0.0546, + "num_tokens": 37751279.0, + "reward": 3.7785048484802246, + "reward_std": 0.6255349516868591, + "rewards/reward_fn/mean": 3.7785048484802246, + "rewards/reward_fn/std": 0.6255349516868591, + "step": 815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 97.0, + "completions/max_terminated_length": 97.0, + "completions/mean_length": 66.34375, + "completions/mean_terminated_length": 66.34375, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.08655988119231993, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.115234375, + "kl": 0.01903400884475559, + "learning_rate": 7.674e-06, + "loss": 0.0008, + "num_tokens": 37785402.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 249.0625, + "completions/mean_terminated_length": 249.0625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.08666595947809483, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0751953125, + "kl": 0.018705508206039667, + "learning_rate": 7.6736e-06, + "loss": 0.0007, + "num_tokens": 37835932.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1291.0, + "completions/max_terminated_length": 1291.0, + "completions/mean_length": 434.9375, + "completions/mean_terminated_length": 434.9375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.08677203776386974, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.021974525414407253, + "learning_rate": 7.6732e-06, + "loss": -0.028, + "num_tokens": 37887770.0, + "reward": 3.078002452850342, + "reward_std": 0.9280657172203064, + "rewards/reward_fn/mean": 3.078002452850342, + "rewards/reward_fn/std": 0.9280656576156616, + "step": 818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 588.0, + "completions/max_terminated_length": 588.0, + "completions/mean_length": 151.3125, + "completions/mean_terminated_length": 151.3125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.08687811604964464, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8125, + "kl": 0.02121459529735148, + "learning_rate": 7.672799999999999e-06, + "loss": 0.0208, + "num_tokens": 37912644.0, + "reward": 3.916912317276001, + "reward_std": 0.2673051953315735, + "rewards/reward_fn/mean": 3.916912317276001, + "rewards/reward_fn/std": 0.2673051655292511, + "step": 819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1782.0, + "completions/max_terminated_length": 1782.0, + "completions/mean_length": 596.375, + "completions/mean_terminated_length": 596.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.08698419433541954, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0703125, + "kl": 0.01679900661110878, + "learning_rate": 7.6724e-06, + "loss": 0.211, + "num_tokens": 37971696.0, + "reward": 2.634601593017578, + "reward_std": 0.8135941624641418, + "rewards/reward_fn/mean": 2.634601593017578, + "rewards/reward_fn/std": 0.8135941028594971, + "step": 820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 80.15625, + "completions/mean_terminated_length": 80.15625, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.08709027262119444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1142578125, + "kl": 0.014007492223754525, + "learning_rate": 7.671999999999999e-06, + "loss": 0.0006, + "num_tokens": 37994645.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1119.0, + "completions/max_terminated_length": 1119.0, + "completions/mean_length": 587.875, + "completions/mean_terminated_length": 587.875, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.08719635090696934, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.016050471109338105, + "learning_rate": 7.6716e-06, + "loss": 0.0154, + "num_tokens": 38054577.0, + "reward": 2.849022388458252, + "reward_std": 0.03270436450839043, + "rewards/reward_fn/mean": 2.849022388458252, + "rewards/reward_fn/std": 0.03270438313484192, + "step": 822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 134.9375, + "completions/mean_terminated_length": 134.9375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.08730242919274425, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10302734375, + "kl": 0.021432526409626007, + "learning_rate": 7.671199999999999e-06, + "loss": 0.0009, + "num_tokens": 38093487.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1193.0, + "completions/max_terminated_length": 1193.0, + "completions/mean_length": 314.75, + "completions/mean_terminated_length": 314.75, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.08740850747851915, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06103515625, + "kl": 0.012500970042310655, + "learning_rate": 7.6708e-06, + "loss": 0.0005, + "num_tokens": 38138407.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1915.0, + "completions/mean_length": 789.125, + "completions/mean_terminated_length": 748.51611328125, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.08751458576429405, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9375, + "kl": 0.014306797063909471, + "learning_rate": 7.6704e-06, + "loss": -0.0029, + "num_tokens": 38237739.0, + "reward": 2.5165700912475586, + "reward_std": 0.8159104585647583, + "rewards/reward_fn/mean": 2.5165700912475586, + "rewards/reward_fn/std": 0.8159104585647583, + "step": 825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1498.0, + "completions/max_terminated_length": 1498.0, + "completions/mean_length": 484.15625, + "completions/mean_terminated_length": 484.15625, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.08762066405006895, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.015709159546531737, + "learning_rate": 7.67e-06, + "loss": 0.1323, + "num_tokens": 38306224.0, + "reward": 2.799229860305786, + "reward_std": 0.059028059244155884, + "rewards/reward_fn/mean": 2.799229860305786, + "rewards/reward_fn/std": 0.059028033167123795, + "step": 826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1373.0, + "completions/max_terminated_length": 1373.0, + "completions/mean_length": 325.90625, + "completions/mean_terminated_length": 325.90625, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.08772674233584385, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.021479285322129726, + "learning_rate": 7.6696e-06, + "loss": -0.0611, + "num_tokens": 38359309.0, + "reward": 3.9237232208251953, + "reward_std": 0.30016380548477173, + "rewards/reward_fn/mean": 3.9237232208251953, + "rewards/reward_fn/std": 0.3001638352870941, + "step": 827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1149.0, + "completions/max_terminated_length": 1149.0, + "completions/mean_length": 188.53125, + "completions/mean_terminated_length": 188.53125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.08783282062161875, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.578125, + "kl": 0.02761186519637704, + "learning_rate": 7.6692e-06, + "loss": -0.0688, + "num_tokens": 38397758.0, + "reward": 3.613966464996338, + "reward_std": 0.5824852585792542, + "rewards/reward_fn/mean": 3.613966464996338, + "rewards/reward_fn/std": 0.5824853181838989, + "step": 828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.0, + "completions/max_terminated_length": 565.0, + "completions/mean_length": 178.53125, + "completions/mean_terminated_length": 178.53125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.08793889890739366, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732421875, + "kl": 0.01873406278900802, + "learning_rate": 7.6688e-06, + "loss": 0.0007, + "num_tokens": 38436335.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1652.0, + "completions/max_terminated_length": 1652.0, + "completions/mean_length": 360.3125, + "completions/mean_terminated_length": 360.3125, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.08804497719316856, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.02481435379013419, + "learning_rate": 7.6684e-06, + "loss": -0.0879, + "num_tokens": 38459545.0, + "reward": 2.863328456878662, + "reward_std": 0.4318251311779022, + "rewards/reward_fn/mean": 2.863328456878662, + "rewards/reward_fn/std": 0.43182510137557983, + "step": 830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 154.5, + "completions/mean_terminated_length": 154.5, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.08815105547894346, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.265625, + "kl": 0.024363160831853747, + "learning_rate": 7.668e-06, + "loss": 0.125, + "num_tokens": 38510345.0, + "reward": 3.92673659324646, + "reward_std": 0.4144406020641327, + "rewards/reward_fn/mean": 3.92673659324646, + "rewards/reward_fn/std": 0.4144406318664551, + "step": 831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 254.53125, + "completions/mean_terminated_length": 254.53125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.08825713376471836, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.021779751870781183, + "learning_rate": 7.6676e-06, + "loss": -0.0541, + "num_tokens": 38568538.0, + "reward": 3.9609336853027344, + "reward_std": 0.2209915965795517, + "rewards/reward_fn/mean": 3.9609336853027344, + "rewards/reward_fn/std": 0.2209915667772293, + "step": 832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 539.09375, + "completions/mean_terminated_length": 490.4193420410156, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.08836321205049326, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.01555003086104989, + "learning_rate": 7.6672e-06, + "loss": 0.1733, + "num_tokens": 38624541.0, + "reward": 2.5395612716674805, + "reward_std": 0.5905151963233948, + "rewards/reward_fn/mean": 2.5395612716674805, + "rewards/reward_fn/std": 0.5905151963233948, + "step": 833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 798.0, + "completions/mean_length": 550.09375, + "completions/mean_terminated_length": 501.774169921875, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.08846929033626817, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.011983388103544712, + "learning_rate": 7.6668e-06, + "loss": 0.1678, + "num_tokens": 38679872.0, + "reward": 2.6343441009521484, + "reward_std": 0.5172379016876221, + "rewards/reward_fn/mean": 2.6343441009521484, + "rewards/reward_fn/std": 0.5172379016876221, + "step": 834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1709.0, + "completions/max_terminated_length": 1709.0, + "completions/mean_length": 458.5, + "completions/mean_terminated_length": 458.5, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.08857536862204307, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0458984375, + "kl": 0.015996047877706587, + "learning_rate": 7.6664e-06, + "loss": 0.0006, + "num_tokens": 38727152.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1978.0, + "completions/max_terminated_length": 1978.0, + "completions/mean_length": 492.3125, + "completions/mean_terminated_length": 492.3125, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.08868144690781797, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.017040529986843467, + "learning_rate": 7.666e-06, + "loss": 0.1084, + "num_tokens": 38779738.0, + "reward": 3.5246658325195312, + "reward_std": 0.5842354893684387, + "rewards/reward_fn/mean": 3.5246658325195312, + "rewards/reward_fn/std": 0.584235429763794, + "step": 836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 922.0, + "completions/max_terminated_length": 922.0, + "completions/mean_length": 243.46875, + "completions/mean_terminated_length": 243.46875, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.08878752519359287, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.076171875, + "kl": 0.01812937785871327, + "learning_rate": 7.6656e-06, + "loss": 0.0007, + "num_tokens": 38807785.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 907.0, + "completions/max_terminated_length": 907.0, + "completions/mean_length": 233.59375, + "completions/mean_terminated_length": 233.59375, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.08889360347936777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11181640625, + "kl": 0.02154145378153771, + "learning_rate": 7.665199999999999e-06, + "loss": 0.0009, + "num_tokens": 38860764.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1264.0, + "completions/max_terminated_length": 1264.0, + "completions/mean_length": 482.875, + "completions/mean_terminated_length": 482.875, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.08899968176514267, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.02117840899154544, + "learning_rate": 7.6648e-06, + "loss": 0.0261, + "num_tokens": 38892632.0, + "reward": 2.9815337657928467, + "reward_std": 0.5712193846702576, + "rewards/reward_fn/mean": 2.9815337657928467, + "rewards/reward_fn/std": 0.5712193250656128, + "step": 839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 96.15625, + "completions/mean_terminated_length": 96.15625, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.08910576005091758, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.119140625, + "kl": 0.012993088574148715, + "learning_rate": 7.664399999999999e-06, + "loss": 0.0005, + "num_tokens": 38921501.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1709.0, + "completions/max_terminated_length": 1709.0, + "completions/mean_length": 292.4375, + "completions/mean_terminated_length": 292.4375, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.08921183833669248, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10400390625, + "kl": 0.023782695876434445, + "learning_rate": 7.664e-06, + "loss": 0.001, + "num_tokens": 38963435.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 251.90625, + "completions/mean_terminated_length": 251.90625, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.08931791662246738, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.021294391248375177, + "learning_rate": 7.663599999999999e-06, + "loss": -0.0354, + "num_tokens": 39018280.0, + "reward": 3.854649305343628, + "reward_std": 0.3912486135959625, + "rewards/reward_fn/mean": 3.854649305343628, + "rewards/reward_fn/std": 0.39124858379364014, + "step": 842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1967.0, + "completions/max_terminated_length": 1967.0, + "completions/mean_length": 521.6875, + "completions/mean_terminated_length": 521.6875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.08942399490824228, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.02067981311120093, + "learning_rate": 7.6632e-06, + "loss": 0.0552, + "num_tokens": 39071550.0, + "reward": 2.960012674331665, + "reward_std": 0.3984401524066925, + "rewards/reward_fn/mean": 2.960012674331665, + "rewards/reward_fn/std": 0.3984401226043701, + "step": 843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1737.0, + "completions/max_terminated_length": 1737.0, + "completions/mean_length": 478.0, + "completions/mean_terminated_length": 478.0, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.08953007319401718, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.019556208280846477, + "learning_rate": 7.662799999999999e-06, + "loss": 0.054, + "num_tokens": 39119614.0, + "reward": 1.653461217880249, + "reward_std": 0.041531752794981, + "rewards/reward_fn/mean": 1.653461217880249, + "rewards/reward_fn/std": 0.041531749069690704, + "step": 844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1977.0, + "completions/mean_length": 608.5625, + "completions/mean_terminated_length": 562.1290283203125, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.08963615147979209, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.017408509156666696, + "learning_rate": 7.6624e-06, + "loss": 0.1428, + "num_tokens": 39176944.0, + "reward": 2.5016188621520996, + "reward_std": 0.5439700484275818, + "rewards/reward_fn/mean": 2.5016188621520996, + "rewards/reward_fn/std": 0.5439700484275818, + "step": 845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 98.9375, + "completions/mean_terminated_length": 98.9375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.08974222976556699, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.671875, + "kl": 0.015259308856911957, + "learning_rate": 7.661999999999999e-06, + "loss": 0.0593, + "num_tokens": 39214222.0, + "reward": 2.9853861331939697, + "reward_std": 0.0448482483625412, + "rewards/reward_fn/mean": 2.9853861331939697, + "rewards/reward_fn/std": 0.04484826698899269, + "step": 846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1274.0, + "completions/max_terminated_length": 1274.0, + "completions/mean_length": 305.65625, + "completions/mean_terminated_length": 305.65625, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.08984830805134189, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.267578125, + "kl": 0.020439033047296107, + "learning_rate": 7.6616e-06, + "loss": 0.0008, + "num_tokens": 39262947.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.0, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 272.21875, + "completions/mean_terminated_length": 272.21875, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.08995438633711679, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07470703125, + "kl": 0.016530890949070454, + "learning_rate": 7.6612e-06, + "loss": 0.0007, + "num_tokens": 39315562.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1454.0, + "completions/mean_length": 934.9375, + "completions/mean_terminated_length": 860.7333984375, + "completions/min_length": 502.0, + "completions/min_terminated_length": 502.0, + "epoch": 0.09006046462289169, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0234375, + "kl": 0.011996885878033936, + "learning_rate": 7.6608e-06, + "loss": 0.1388, + "num_tokens": 39382120.0, + "reward": 2.5481839179992676, + "reward_std": 0.7321963906288147, + "rewards/reward_fn/mean": 2.5481839179992676, + "rewards/reward_fn/std": 0.7321963906288147, + "step": 849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1143.0, + "completions/max_terminated_length": 1143.0, + "completions/mean_length": 247.875, + "completions/mean_terminated_length": 247.875, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.0901665429086666, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9921875, + "kl": 0.025268358644098043, + "learning_rate": 7.6604e-06, + "loss": -0.0004, + "num_tokens": 39427172.0, + "reward": 3.895744800567627, + "reward_std": 0.4322659969329834, + "rewards/reward_fn/mean": 3.895744800567627, + "rewards/reward_fn/std": 0.43226608633995056, + "step": 850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1147.0, + "completions/max_terminated_length": 1147.0, + "completions/mean_length": 307.03125, + "completions/mean_terminated_length": 307.03125, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.0902726211944415, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06494140625, + "kl": 0.018672091653570533, + "learning_rate": 7.66e-06, + "loss": 0.0007, + "num_tokens": 39471077.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1563.0, + "completions/max_terminated_length": 1563.0, + "completions/mean_length": 302.4375, + "completions/mean_terminated_length": 302.4375, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.0903786994802164, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.016184502048417926, + "learning_rate": 7.6596e-06, + "loss": -0.0154, + "num_tokens": 39516083.0, + "reward": 3.964545249938965, + "reward_std": 0.2005615234375, + "rewards/reward_fn/mean": 3.964545249938965, + "rewards/reward_fn/std": 0.2005615085363388, + "step": 852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1340.0, + "completions/mean_length": 324.40625, + "completions/mean_terminated_length": 268.80645751953125, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.0904847777659913, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.01840786065440625, + "learning_rate": 7.6592e-06, + "loss": 0.3014, + "num_tokens": 39581696.0, + "reward": 3.629117012023926, + "reward_std": 0.9064881205558777, + "rewards/reward_fn/mean": 3.629117012023926, + "rewards/reward_fn/std": 0.9064880609512329, + "step": 853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1126.0, + "completions/max_terminated_length": 1126.0, + "completions/mean_length": 262.0, + "completions/mean_terminated_length": 262.0, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.0905908560517662, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.02637373749166727, + "learning_rate": 7.6588e-06, + "loss": 0.1283, + "num_tokens": 39620544.0, + "reward": 2.9511616230010986, + "reward_std": 0.08586955070495605, + "rewards/reward_fn/mean": 2.9511616230010986, + "rewards/reward_fn/std": 0.08586958050727844, + "step": 854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1835.0, + "completions/max_terminated_length": 1835.0, + "completions/mean_length": 448.1875, + "completions/mean_terminated_length": 448.1875, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.0906969343375411, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.01755295612383634, + "learning_rate": 7.6584e-06, + "loss": 0.0497, + "num_tokens": 39678278.0, + "reward": 3.100637197494507, + "reward_std": 0.4405517578125, + "rewards/reward_fn/mean": 3.100637197494507, + "rewards/reward_fn/std": 0.4405516982078552, + "step": 855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1361.0, + "completions/max_terminated_length": 1361.0, + "completions/mean_length": 370.0625, + "completions/mean_terminated_length": 370.0625, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.09080301262331601, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.02147255139425397, + "learning_rate": 7.658e-06, + "loss": 0.0513, + "num_tokens": 39723720.0, + "reward": 2.9578869342803955, + "reward_std": 0.45630425214767456, + "rewards/reward_fn/mean": 2.9578869342803955, + "rewards/reward_fn/std": 0.45630425214767456, + "step": 856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 160.34375, + "completions/mean_terminated_length": 160.34375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.09090909090909091, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.134765625, + "kl": 0.02221720013767481, + "learning_rate": 7.6576e-06, + "loss": 0.0009, + "num_tokens": 39767827.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1939.0, + "completions/mean_length": 474.875, + "completions/mean_terminated_length": 370.0000305175781, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.09101516919486581, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.02296722703613341, + "learning_rate": 7.657199999999998e-06, + "loss": 0.1096, + "num_tokens": 39814287.0, + "reward": 2.8872292041778564, + "reward_std": 0.912183940410614, + "rewards/reward_fn/mean": 2.8872292041778564, + "rewards/reward_fn/std": 0.9121840596199036, + "step": 858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1292.0, + "completions/max_terminated_length": 1292.0, + "completions/mean_length": 396.125, + "completions/mean_terminated_length": 396.125, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.09112124748064071, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.019654609728604555, + "learning_rate": 7.6568e-06, + "loss": -0.0741, + "num_tokens": 39861555.0, + "reward": 3.9295010566711426, + "reward_std": 0.39880141615867615, + "rewards/reward_fn/mean": 3.9295010566711426, + "rewards/reward_fn/std": 0.39880138635635376, + "step": 859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 589.0, + "completions/max_terminated_length": 589.0, + "completions/mean_length": 378.1875, + "completions/mean_terminated_length": 378.1875, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.09122732576641561, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.023176385322585702, + "learning_rate": 7.6564e-06, + "loss": 0.029, + "num_tokens": 39908185.0, + "reward": 2.712822437286377, + "reward_std": 0.5235727429389954, + "rewards/reward_fn/mean": 2.712822437286377, + "rewards/reward_fn/std": 0.5235726833343506, + "step": 860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1054.0, + "completions/max_terminated_length": 1054.0, + "completions/mean_length": 582.9375, + "completions/mean_terminated_length": 582.9375, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.09133340405219052, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.02485931245610118, + "learning_rate": 7.656e-06, + "loss": 0.0291, + "num_tokens": 39970263.0, + "reward": 2.0070159435272217, + "reward_std": 0.5709453821182251, + "rewards/reward_fn/mean": 2.0070159435272217, + "rewards/reward_fn/std": 0.5709454417228699, + "step": 861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1708.0, + "completions/mean_length": 727.5, + "completions/mean_terminated_length": 684.9031982421875, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.09143948233796542, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.021499027032405138, + "learning_rate": 7.6556e-06, + "loss": 0.1757, + "num_tokens": 40033191.0, + "reward": 2.741589069366455, + "reward_std": 0.7133889198303223, + "rewards/reward_fn/mean": 2.741589069366455, + "rewards/reward_fn/std": 0.713388979434967, + "step": 862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 847.0, + "completions/max_terminated_length": 847.0, + "completions/mean_length": 166.59375, + "completions/mean_terminated_length": 166.59375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.09154556062374032, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1376953125, + "kl": 0.01900175167247653, + "learning_rate": 7.655199999999999e-06, + "loss": 0.0008, + "num_tokens": 40071258.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 138.9375, + "completions/mean_terminated_length": 138.9375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.09165163890951522, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.640625, + "kl": 0.02012345683760941, + "learning_rate": 7.6548e-06, + "loss": 0.0304, + "num_tokens": 40103192.0, + "reward": 3.9747822284698486, + "reward_std": 0.14265310764312744, + "rewards/reward_fn/mean": 3.9747822284698486, + "rewards/reward_fn/std": 0.14265307784080505, + "step": 864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 748.0, + "completions/max_terminated_length": 748.0, + "completions/mean_length": 239.40625, + "completions/mean_terminated_length": 239.40625, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.09175771719529012, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "kl": 0.023207159945741296, + "learning_rate": 7.654399999999999e-06, + "loss": 0.0592, + "num_tokens": 40141413.0, + "reward": 3.481161117553711, + "reward_std": 0.5994711518287659, + "rewards/reward_fn/mean": 3.481161117553711, + "rewards/reward_fn/std": 0.5994711518287659, + "step": 865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.0, + "completions/max_terminated_length": 539.0, + "completions/mean_length": 184.46875, + "completions/mean_terminated_length": 184.46875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.09186379548106502, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1416015625, + "kl": 0.025038019753992558, + "learning_rate": 7.654e-06, + "loss": 0.001, + "num_tokens": 40193268.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 972.0, + "completions/max_terminated_length": 972.0, + "completions/mean_length": 258.125, + "completions/mean_terminated_length": 258.125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.09196987376683993, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.019463328178972006, + "learning_rate": 7.653599999999999e-06, + "loss": 0.0152, + "num_tokens": 40232248.0, + "reward": 2.827080249786377, + "reward_std": 0.38547733426094055, + "rewards/reward_fn/mean": 2.827080249786377, + "rewards/reward_fn/std": 0.38547733426094055, + "step": 867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1029.0, + "completions/max_terminated_length": 1029.0, + "completions/mean_length": 183.78125, + "completions/mean_terminated_length": 183.78125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.09207595205261483, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6875, + "kl": 0.0231759175658226, + "learning_rate": 7.6532e-06, + "loss": 0.3201, + "num_tokens": 40283857.0, + "reward": 3.9275426864624023, + "reward_std": 0.40988099575042725, + "rewards/reward_fn/mean": 3.9275426864624023, + "rewards/reward_fn/std": 0.40988096594810486, + "step": 868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1254.0, + "completions/max_terminated_length": 1254.0, + "completions/mean_length": 375.5625, + "completions/mean_terminated_length": 375.5625, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.09218203033838973, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.02289043739438057, + "learning_rate": 7.652799999999999e-06, + "loss": 0.0872, + "num_tokens": 40329763.0, + "reward": 3.6777396202087402, + "reward_std": 0.4553356170654297, + "rewards/reward_fn/mean": 3.6777396202087402, + "rewards/reward_fn/std": 0.4553355872631073, + "step": 869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1893.0, + "completions/max_terminated_length": 1893.0, + "completions/mean_length": 473.125, + "completions/mean_terminated_length": 473.125, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.09228810862416463, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.01598887878935784, + "learning_rate": 7.6524e-06, + "loss": 0.0179, + "num_tokens": 40403239.0, + "reward": 2.7556214332580566, + "reward_std": 0.2854246497154236, + "rewards/reward_fn/mean": 2.7556214332580566, + "rewards/reward_fn/std": 0.28542467951774597, + "step": 870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 192.84375, + "completions/mean_terminated_length": 192.84375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.09239418690993953, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.022107360186055303, + "learning_rate": 7.652e-06, + "loss": 0.0033, + "num_tokens": 40428162.0, + "reward": 3.720233201980591, + "reward_std": 0.4924321472644806, + "rewards/reward_fn/mean": 3.720233201980591, + "rewards/reward_fn/std": 0.492432177066803, + "step": 871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1444.0, + "completions/max_terminated_length": 1444.0, + "completions/mean_length": 297.59375, + "completions/mean_terminated_length": 297.59375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.09250026519571444, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.018545285565778613, + "learning_rate": 7.6516e-06, + "loss": 0.0992, + "num_tokens": 40486805.0, + "reward": 3.9627604484558105, + "reward_std": 0.21065910160541534, + "rewards/reward_fn/mean": 3.9627604484558105, + "rewards/reward_fn/std": 0.21065913140773773, + "step": 872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 246.0625, + "completions/mean_terminated_length": 246.0625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.09260634348148934, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.02543105883523822, + "learning_rate": 7.6512e-06, + "loss": -0.0205, + "num_tokens": 40540087.0, + "reward": 3.9671568870544434, + "reward_std": 0.18578803539276123, + "rewards/reward_fn/mean": 3.9671568870544434, + "rewards/reward_fn/std": 0.18578806519508362, + "step": 873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 224.21875, + "completions/mean_terminated_length": 224.21875, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.09271242176726424, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.018198609352111816, + "learning_rate": 7.6508e-06, + "loss": 0.0684, + "num_tokens": 40569118.0, + "reward": 3.9687681198120117, + "reward_std": 0.17667338252067566, + "rewards/reward_fn/mean": 3.9687681198120117, + "rewards/reward_fn/std": 0.17667338252067566, + "step": 874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1499.0, + "completions/max_terminated_length": 1499.0, + "completions/mean_length": 536.28125, + "completions/mean_terminated_length": 536.28125, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.09281850005303914, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.01982128922827542, + "learning_rate": 7.6504e-06, + "loss": -0.0039, + "num_tokens": 40620615.0, + "reward": 2.6173102855682373, + "reward_std": 0.29557812213897705, + "rewards/reward_fn/mean": 2.6173102855682373, + "rewards/reward_fn/std": 0.29557812213897705, + "step": 875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1640.0, + "completions/max_terminated_length": 1640.0, + "completions/mean_length": 477.5, + "completions/mean_terminated_length": 477.5, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.09292457833881404, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.02005974156782031, + "learning_rate": 7.65e-06, + "loss": 0.0006, + "num_tokens": 40666359.0, + "reward": 2.715482473373413, + "reward_std": 0.26461300253868103, + "rewards/reward_fn/mean": 2.715482473373413, + "rewards/reward_fn/std": 0.2646130323410034, + "step": 876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 593.0, + "completions/max_terminated_length": 593.0, + "completions/mean_length": 324.375, + "completions/mean_terminated_length": 324.375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.09303065662458895, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.023724806495010853, + "learning_rate": 7.6496e-06, + "loss": -0.0251, + "num_tokens": 40712195.0, + "reward": 2.6783430576324463, + "reward_std": 0.43827834725379944, + "rewards/reward_fn/mean": 2.6783430576324463, + "rewards/reward_fn/std": 0.43827834725379944, + "step": 877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1622.0, + "completions/max_terminated_length": 1622.0, + "completions/mean_length": 609.21875, + "completions/mean_terminated_length": 609.21875, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.09313673491036385, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.015918764751404524, + "learning_rate": 7.6492e-06, + "loss": 0.2026, + "num_tokens": 40765418.0, + "reward": 2.542436122894287, + "reward_std": 0.3645319640636444, + "rewards/reward_fn/mean": 2.542436122894287, + "rewards/reward_fn/std": 0.3645319640636444, + "step": 878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 599.0, + "completions/max_terminated_length": 599.0, + "completions/mean_length": 329.6875, + "completions/mean_terminated_length": 329.6875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.09324281319613875, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.020563342375680804, + "learning_rate": 7.648799999999999e-06, + "loss": -0.0037, + "num_tokens": 40812096.0, + "reward": 3.659517526626587, + "reward_std": 0.78047776222229, + "rewards/reward_fn/mean": 3.659517526626587, + "rewards/reward_fn/std": 0.7804778218269348, + "step": 879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1857.0, + "completions/max_terminated_length": 1857.0, + "completions/mean_length": 410.0, + "completions/mean_terminated_length": 410.0, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.09334889148191365, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.01993569522164762, + "learning_rate": 7.6484e-06, + "loss": 0.2059, + "num_tokens": 40862816.0, + "reward": 3.4595108032226562, + "reward_std": 0.9185887575149536, + "rewards/reward_fn/mean": 3.4595108032226562, + "rewards/reward_fn/std": 0.9185887575149536, + "step": 880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 581.0, + "completions/max_terminated_length": 581.0, + "completions/mean_length": 205.53125, + "completions/mean_terminated_length": 205.53125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.09345496976768855, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.02375972643494606, + "learning_rate": 7.647999999999999e-06, + "loss": -0.0053, + "num_tokens": 40906737.0, + "reward": 3.0442795753479004, + "reward_std": 0.4683447778224945, + "rewards/reward_fn/mean": 3.0442795753479004, + "rewards/reward_fn/std": 0.4683447480201721, + "step": 881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1041.0, + "completions/max_terminated_length": 1041.0, + "completions/mean_length": 371.21875, + "completions/mean_terminated_length": 371.21875, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.09356104805346345, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.02078143460676074, + "learning_rate": 7.6476e-06, + "loss": 0.0293, + "num_tokens": 40958680.0, + "reward": 2.791837692260742, + "reward_std": 0.0601964108645916, + "rewards/reward_fn/mean": 2.791837692260742, + "rewards/reward_fn/std": 0.06019642949104309, + "step": 882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 220.625, + "completions/mean_terminated_length": 220.625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.09366712633923836, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.014467753586359322, + "learning_rate": 7.6472e-06, + "loss": 0.0468, + "num_tokens": 41006028.0, + "reward": 3.193929672241211, + "reward_std": 0.0326162613928318, + "rewards/reward_fn/mean": 3.193929672241211, + "rewards/reward_fn/std": 0.03261625021696091, + "step": 883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1583.0, + "completions/mean_length": 310.75, + "completions/mean_terminated_length": 254.7096710205078, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.09377320462501326, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.026728518772870302, + "learning_rate": 7.6468e-06, + "loss": 0.3628, + "num_tokens": 41045572.0, + "reward": 3.0327024459838867, + "reward_std": 0.8356220722198486, + "rewards/reward_fn/mean": 3.0327024459838867, + "rewards/reward_fn/std": 0.8356220126152039, + "step": 884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 858.0, + "completions/max_terminated_length": 858.0, + "completions/mean_length": 200.5, + "completions/mean_terminated_length": 200.5, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.09387928291078816, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.020390488440170884, + "learning_rate": 7.6464e-06, + "loss": 0.0299, + "num_tokens": 41089652.0, + "reward": 3.96085786819458, + "reward_std": 0.15403839945793152, + "rewards/reward_fn/mean": 3.96085786819458, + "rewards/reward_fn/std": 0.1540384292602539, + "step": 885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1878.0, + "completions/mean_length": 977.59375, + "completions/mean_terminated_length": 943.0645141601562, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "epoch": 0.09398536119656306, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.078125, + "kl": 0.017107663094066083, + "learning_rate": 7.646e-06, + "loss": 0.0785, + "num_tokens": 41176263.0, + "reward": 2.5095417499542236, + "reward_std": 0.5657038688659668, + "rewards/reward_fn/mean": 2.5095417499542236, + "rewards/reward_fn/std": 0.5657038688659668, + "step": 886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1121.0, + "completions/max_terminated_length": 1121.0, + "completions/mean_length": 222.34375, + "completions/mean_terminated_length": 222.34375, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.09409143948233796, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.01801448129117489, + "learning_rate": 7.6456e-06, + "loss": -0.0125, + "num_tokens": 41209234.0, + "reward": 2.8958828449249268, + "reward_std": 0.20669691264629364, + "rewards/reward_fn/mean": 2.8958828449249268, + "rewards/reward_fn/std": 0.20669691264629364, + "step": 887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1028.0, + "completions/max_terminated_length": 1028.0, + "completions/mean_length": 300.0, + "completions/mean_terminated_length": 300.0, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.09419751776811287, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.015553548815660179, + "learning_rate": 7.6452e-06, + "loss": 0.0764, + "num_tokens": 41254418.0, + "reward": 3.924589157104492, + "reward_std": 0.4265884757041931, + "rewards/reward_fn/mean": 3.924589157104492, + "rewards/reward_fn/std": 0.4265885055065155, + "step": 888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.0, + "completions/max_terminated_length": 517.0, + "completions/mean_length": 157.1875, + "completions/mean_terminated_length": 157.1875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.09430359605388777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1552734375, + "kl": 0.03170281834900379, + "learning_rate": 7.6448e-06, + "loss": 0.0013, + "num_tokens": 41298296.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.0, + "completions/max_terminated_length": 674.0, + "completions/mean_length": 204.5, + "completions/mean_terminated_length": 204.5, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.09440967433966267, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.359375, + "kl": 0.03371644695289433, + "learning_rate": 7.6444e-06, + "loss": 0.0301, + "num_tokens": 41342216.0, + "reward": 3.099053382873535, + "reward_std": 0.3049076199531555, + "rewards/reward_fn/mean": 3.099053382873535, + "rewards/reward_fn/std": 0.3049076497554779, + "step": 890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 184.71875, + "completions/mean_terminated_length": 184.71875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.09451575262543757, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.625, + "kl": 0.02772973943501711, + "learning_rate": 7.644e-06, + "loss": 0.1017, + "num_tokens": 41380127.0, + "reward": 3.9672179222106934, + "reward_std": 0.18544383347034454, + "rewards/reward_fn/mean": 3.9672179222106934, + "rewards/reward_fn/std": 0.18544386327266693, + "step": 891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 252.875, + "completions/mean_terminated_length": 252.875, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.09462183091121247, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09423828125, + "kl": 0.022775678196921945, + "learning_rate": 7.643599999999999e-06, + "loss": 0.0009, + "num_tokens": 41434459.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 743.0, + "completions/mean_length": 463.28125, + "completions/mean_terminated_length": 412.1612854003906, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.09472790919698737, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.022876902716234326, + "learning_rate": 7.6432e-06, + "loss": 0.2348, + "num_tokens": 41483044.0, + "reward": 3.818485736846924, + "reward_std": 0.7323809266090393, + "rewards/reward_fn/mean": 3.818485736846924, + "rewards/reward_fn/std": 0.7323809266090393, + "step": 893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1393.0, + "completions/max_terminated_length": 1393.0, + "completions/mean_length": 417.28125, + "completions/mean_terminated_length": 417.28125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.09483398748276228, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.02479085512459278, + "learning_rate": 7.6428e-06, + "loss": -0.0023, + "num_tokens": 41526541.0, + "reward": 3.401907444000244, + "reward_std": 0.5439134240150452, + "rewards/reward_fn/mean": 3.401907444000244, + "rewards/reward_fn/std": 0.5439134240150452, + "step": 894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 137.875, + "completions/mean_terminated_length": 137.875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.09494006576853718, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.27734375, + "kl": 0.03966691764071584, + "learning_rate": 7.6424e-06, + "loss": 0.0016, + "num_tokens": 41574281.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 770.0, + "completions/max_terminated_length": 770.0, + "completions/mean_length": 280.8125, + "completions/mean_terminated_length": 280.8125, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.09504614405431208, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.022205424029380083, + "learning_rate": 7.642e-06, + "loss": -0.0289, + "num_tokens": 41623075.0, + "reward": 3.148716688156128, + "reward_std": 0.39955171942710876, + "rewards/reward_fn/mean": 3.148716688156128, + "rewards/reward_fn/std": 0.3995516896247864, + "step": 896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1307.0, + "completions/max_terminated_length": 1307.0, + "completions/mean_length": 313.78125, + "completions/mean_terminated_length": 313.78125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.09515222234008698, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.02836911752820015, + "learning_rate": 7.6416e-06, + "loss": 0.0118, + "num_tokens": 41669948.0, + "reward": 3.4885735511779785, + "reward_std": 0.6286227107048035, + "rewards/reward_fn/mean": 3.4885735511779785, + "rewards/reward_fn/std": 0.6286226511001587, + "step": 897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 870.0, + "completions/max_terminated_length": 870.0, + "completions/mean_length": 356.21875, + "completions/mean_terminated_length": 356.21875, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.09525830062586188, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.018277009017765522, + "learning_rate": 7.6412e-06, + "loss": 0.0616, + "num_tokens": 41715267.0, + "reward": 3.8610944747924805, + "reward_std": 0.3736887574195862, + "rewards/reward_fn/mean": 3.8610944747924805, + "rewards/reward_fn/std": 0.3736887574195862, + "step": 898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 137.5625, + "completions/mean_terminated_length": 137.5625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.0953643789116368, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.023039878346025944, + "learning_rate": 7.6408e-06, + "loss": -0.0044, + "num_tokens": 41749525.0, + "reward": 2.80351185798645, + "reward_std": 0.03616482764482498, + "rewards/reward_fn/mean": 2.80351185798645, + "rewards/reward_fn/std": 0.036164846271276474, + "step": 899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1680.0, + "completions/mean_length": 790.65625, + "completions/mean_terminated_length": 706.8333740234375, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "epoch": 0.09547045719741169, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.02036685636267066, + "learning_rate": 7.640399999999999e-06, + "loss": 0.1461, + "num_tokens": 41817514.0, + "reward": 1.921440839767456, + "reward_std": 0.7092944383621216, + "rewards/reward_fn/mean": 1.921440839767456, + "rewards/reward_fn/std": 0.7092943787574768, + "step": 900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 669.0, + "completions/max_terminated_length": 669.0, + "completions/mean_length": 280.15625, + "completions/mean_terminated_length": 280.15625, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.09557653548318659, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.01943918946199119, + "learning_rate": 7.64e-06, + "loss": 0.0304, + "num_tokens": 41864623.0, + "reward": 3.3483524322509766, + "reward_std": 1.0586309432983398, + "rewards/reward_fn/mean": 3.3483524322509766, + "rewards/reward_fn/std": 1.0586309432983398, + "step": 901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1026.0, + "completions/max_terminated_length": 1026.0, + "completions/mean_length": 226.78125, + "completions/mean_terminated_length": 226.78125, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.09568261376896149, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11669921875, + "kl": 0.02719919686205685, + "learning_rate": 7.639599999999999e-06, + "loss": 0.0011, + "num_tokens": 41918024.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 323.78125, + "completions/mean_terminated_length": 323.78125, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.09578869205473639, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.028861571103334427, + "learning_rate": 7.6392e-06, + "loss": -0.0134, + "num_tokens": 41965441.0, + "reward": 2.796419143676758, + "reward_std": 0.3470582664012909, + "rewards/reward_fn/mean": 2.796419143676758, + "rewards/reward_fn/std": 0.3470582365989685, + "step": 903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 899.0, + "completions/max_terminated_length": 899.0, + "completions/mean_length": 253.96875, + "completions/mean_terminated_length": 253.96875, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.0958947703405113, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.022272631525993347, + "learning_rate": 7.638799999999999e-06, + "loss": -0.0688, + "num_tokens": 42010112.0, + "reward": 3.690798282623291, + "reward_std": 0.5035431981086731, + "rewards/reward_fn/mean": 3.690798282623291, + "rewards/reward_fn/std": 0.5035431981086731, + "step": 904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1235.0, + "completions/max_terminated_length": 1235.0, + "completions/mean_length": 385.0625, + "completions/mean_terminated_length": 385.0625, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.0960008486262862, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.031112836906686425, + "learning_rate": 7.6384e-06, + "loss": 0.0479, + "num_tokens": 42074306.0, + "reward": 2.775426149368286, + "reward_std": 0.4809582829475403, + "rewards/reward_fn/mean": 2.775426149368286, + "rewards/reward_fn/std": 0.48095831274986267, + "step": 905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1885.0, + "completions/max_terminated_length": 1885.0, + "completions/mean_length": 313.6875, + "completions/mean_terminated_length": 313.6875, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.0961069269120611, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.020309182349592447, + "learning_rate": 7.638e-06, + "loss": -0.0108, + "num_tokens": 42123544.0, + "reward": 2.7922677993774414, + "reward_std": 0.33847782015800476, + "rewards/reward_fn/mean": 2.7922677993774414, + "rewards/reward_fn/std": 0.33847787976264954, + "step": 906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1131.0, + "completions/max_terminated_length": 1131.0, + "completions/mean_length": 415.65625, + "completions/mean_terminated_length": 415.65625, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.096213005197836, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.02549328119494021, + "learning_rate": 7.6376e-06, + "loss": 0.0944, + "num_tokens": 42171149.0, + "reward": 3.443866729736328, + "reward_std": 0.6673449277877808, + "rewards/reward_fn/mean": 3.443866729736328, + "rewards/reward_fn/std": 0.6673449277877808, + "step": 907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 140.0, + "completions/mean_terminated_length": 140.0, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.0963190834836109, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.134765625, + "kl": 0.027116876328364015, + "learning_rate": 7.6372e-06, + "loss": 0.0011, + "num_tokens": 42206669.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 790.0, + "completions/max_terminated_length": 790.0, + "completions/mean_length": 265.84375, + "completions/mean_terminated_length": 265.84375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.0964251617693858, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.01707066618837416, + "learning_rate": 7.6368e-06, + "loss": 0.0234, + "num_tokens": 42253128.0, + "reward": 2.9526233673095703, + "reward_std": 0.3429475724697113, + "rewards/reward_fn/mean": 2.9526233673095703, + "rewards/reward_fn/std": 0.3429475724697113, + "step": 909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 179.0, + "completions/max_terminated_length": 179.0, + "completions/mean_length": 121.71875, + "completions/mean_terminated_length": 121.71875, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.09653124005516071, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19140625, + "kl": 0.029393920907750726, + "learning_rate": 7.6364e-06, + "loss": 0.0012, + "num_tokens": 42296863.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1530.0, + "completions/max_terminated_length": 1530.0, + "completions/mean_length": 290.84375, + "completions/mean_terminated_length": 290.84375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.09663731834093561, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.02383925556205213, + "learning_rate": 7.636e-06, + "loss": 0.0789, + "num_tokens": 42336154.0, + "reward": 2.788799524307251, + "reward_std": 0.29157719016075134, + "rewards/reward_fn/mean": 2.788799524307251, + "rewards/reward_fn/std": 0.29157716035842896, + "step": 911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 634.0, + "completions/max_terminated_length": 634.0, + "completions/mean_length": 230.71875, + "completions/mean_terminated_length": 230.71875, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.09674339662671051, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.027955673867836595, + "learning_rate": 7.6356e-06, + "loss": -0.0334, + "num_tokens": 42378865.0, + "reward": 2.8833703994750977, + "reward_std": 0.21842359006404877, + "rewards/reward_fn/mean": 2.8833703994750977, + "rewards/reward_fn/std": 0.21842356026172638, + "step": 912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 169.0, + "completions/mean_terminated_length": 169.0, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.09684947491248541, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11962890625, + "kl": 0.030652977991849184, + "learning_rate": 7.6352e-06, + "loss": 0.0012, + "num_tokens": 42412497.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 965.0, + "completions/max_terminated_length": 965.0, + "completions/mean_length": 320.3125, + "completions/mean_terminated_length": 320.3125, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.09695555319826031, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.027334638172760606, + "learning_rate": 7.6348e-06, + "loss": -0.0232, + "num_tokens": 42459707.0, + "reward": 3.9664275646209717, + "reward_std": 0.18991468846797943, + "rewards/reward_fn/mean": 3.9664275646209717, + "rewards/reward_fn/std": 0.18991467356681824, + "step": 914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1007.0, + "completions/max_terminated_length": 1007.0, + "completions/mean_length": 287.5, + "completions/mean_terminated_length": 287.5, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.09706163148403522, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.026974455220624804, + "learning_rate": 7.6344e-06, + "loss": 0.0048, + "num_tokens": 42512107.0, + "reward": 2.9059715270996094, + "reward_std": 0.3558712303638458, + "rewards/reward_fn/mean": 2.9059715270996094, + "rewards/reward_fn/std": 0.3558712303638458, + "step": 915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 733.0, + "completions/max_terminated_length": 733.0, + "completions/mean_length": 261.1875, + "completions/mean_terminated_length": 261.1875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.09716770976981012, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.024873450631275773, + "learning_rate": 7.634e-06, + "loss": 0.0466, + "num_tokens": 42561489.0, + "reward": 3.0658812522888184, + "reward_std": 0.45738157629966736, + "rewards/reward_fn/mean": 3.0658812522888184, + "rewards/reward_fn/std": 0.45738163590431213, + "step": 916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 764.0, + "completions/max_terminated_length": 764.0, + "completions/mean_length": 134.875, + "completions/mean_terminated_length": 134.875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.09727378805558502, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17578125, + "kl": 0.02832574676722288, + "learning_rate": 7.6336e-06, + "loss": 0.0011, + "num_tokens": 42610765.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 276.375, + "completions/mean_terminated_length": 276.375, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.09737986634135992, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.022886406630277634, + "learning_rate": 7.6332e-06, + "loss": -0.0003, + "num_tokens": 42658553.0, + "reward": 3.8918814659118652, + "reward_std": 0.4476149380207062, + "rewards/reward_fn/mean": 3.8918814659118652, + "rewards/reward_fn/std": 0.4476148784160614, + "step": 918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 786.0, + "completions/max_terminated_length": 786.0, + "completions/mean_length": 377.375, + "completions/mean_terminated_length": 377.375, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.09748594462713482, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.014997152611613274, + "learning_rate": 7.6328e-06, + "loss": 0.0899, + "num_tokens": 42718917.0, + "reward": 3.833319664001465, + "reward_std": 0.44810566306114197, + "rewards/reward_fn/mean": 3.833319664001465, + "rewards/reward_fn/std": 0.4481056332588196, + "step": 919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1032.0, + "completions/max_terminated_length": 1032.0, + "completions/mean_length": 214.65625, + "completions/mean_terminated_length": 214.65625, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.09759202291290972, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.126953125, + "kl": 0.026748921489343047, + "learning_rate": 7.6324e-06, + "loss": 0.0011, + "num_tokens": 42766810.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 277.9375, + "completions/mean_terminated_length": 277.9375, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.09769810119868463, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.029225841630250216, + "learning_rate": 7.631999999999999e-06, + "loss": -0.0142, + "num_tokens": 42812408.0, + "reward": 3.3031575679779053, + "reward_std": 0.5849432945251465, + "rewards/reward_fn/mean": 3.3031575679779053, + "rewards/reward_fn/std": 0.5849433541297913, + "step": 921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 80.0, + "completions/max_terminated_length": 80.0, + "completions/mean_length": 72.1875, + "completions/mean_terminated_length": 72.1875, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.09780417948445953, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.228515625, + "kl": 0.01937575329793617, + "learning_rate": 7.6316e-06, + "loss": 0.0008, + "num_tokens": 42856382.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 300.40625, + "completions/mean_terminated_length": 300.40625, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.09791025777023443, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.04207867290824652, + "learning_rate": 7.631199999999999e-06, + "loss": 0.0742, + "num_tokens": 42903371.0, + "reward": 3.767624855041504, + "reward_std": 0.5340158939361572, + "rewards/reward_fn/mean": 3.767624855041504, + "rewards/reward_fn/std": 0.5340158343315125, + "step": 923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1156.0, + "completions/max_terminated_length": 1156.0, + "completions/mean_length": 324.9375, + "completions/mean_terminated_length": 324.9375, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.09801633605600933, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.027455645380541682, + "learning_rate": 7.6308e-06, + "loss": -0.0995, + "num_tokens": 42947977.0, + "reward": 3.0452606678009033, + "reward_std": 0.368459552526474, + "rewards/reward_fn/mean": 3.0452606678009033, + "rewards/reward_fn/std": 0.368459552526474, + "step": 924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 896.0, + "completions/mean_length": 472.5, + "completions/mean_terminated_length": 421.6773986816406, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.09812241434178423, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.01985622337087989, + "learning_rate": 7.630399999999999e-06, + "loss": 0.2311, + "num_tokens": 42978681.0, + "reward": 2.82580828666687, + "reward_std": 0.7253391146659851, + "rewards/reward_fn/mean": 2.82580828666687, + "rewards/reward_fn/std": 0.7253391742706299, + "step": 925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 411.09375, + "completions/mean_terminated_length": 411.09375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.09822849262755914, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.03032882115803659, + "learning_rate": 7.63e-06, + "loss": 0.0001, + "num_tokens": 43022940.0, + "reward": 3.6942696571350098, + "reward_std": 0.6968668699264526, + "rewards/reward_fn/mean": 3.6942696571350098, + "rewards/reward_fn/std": 0.6968669295310974, + "step": 926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 888.0, + "completions/max_terminated_length": 888.0, + "completions/mean_length": 238.1875, + "completions/mean_terminated_length": 238.1875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.09833457091333404, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.035402664449065924, + "learning_rate": 7.629599999999999e-06, + "loss": 0.0078, + "num_tokens": 43059234.0, + "reward": 3.816627025604248, + "reward_std": 0.39149269461631775, + "rewards/reward_fn/mean": 3.816627025604248, + "rewards/reward_fn/std": 0.39149269461631775, + "step": 927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 538.0, + "completions/max_terminated_length": 538.0, + "completions/mean_length": 207.5625, + "completions/mean_terminated_length": 207.5625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.09844064919910894, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.025182449258863926, + "learning_rate": 7.6292e-06, + "loss": 0.013, + "num_tokens": 43098900.0, + "reward": 3.970850944519043, + "reward_std": 0.16489259898662567, + "rewards/reward_fn/mean": 3.970850944519043, + "rewards/reward_fn/std": 0.16489258408546448, + "step": 928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1068.0, + "completions/max_terminated_length": 1068.0, + "completions/mean_length": 260.90625, + "completions/mean_terminated_length": 260.90625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.09854672748488384, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.0351427448913455, + "learning_rate": 7.6288e-06, + "loss": -0.0097, + "num_tokens": 43140209.0, + "reward": 3.0380711555480957, + "reward_std": 0.37293097376823425, + "rewards/reward_fn/mean": 3.0380711555480957, + "rewards/reward_fn/std": 0.37293094396591187, + "step": 929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1932.0, + "completions/mean_length": 742.28125, + "completions/mean_terminated_length": 655.2333374023438, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.09865280577065874, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.019054226577281952, + "learning_rate": 7.6284e-06, + "loss": 0.31, + "num_tokens": 43201562.0, + "reward": 2.4119246006011963, + "reward_std": 0.6986344456672668, + "rewards/reward_fn/mean": 2.4119246006011963, + "rewards/reward_fn/std": 0.6986343860626221, + "step": 930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 144.28125, + "completions/mean_terminated_length": 144.28125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.09875888405643365, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09814453125, + "kl": 0.02223593066446483, + "learning_rate": 7.628e-06, + "loss": 0.0009, + "num_tokens": 43242531.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1766.0, + "completions/max_terminated_length": 1766.0, + "completions/mean_length": 392.96875, + "completions/mean_terminated_length": 392.96875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.09886496234220855, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.019913258031010628, + "learning_rate": 7.6276e-06, + "loss": 0.0549, + "num_tokens": 43291458.0, + "reward": 3.416562080383301, + "reward_std": 0.694709300994873, + "rewards/reward_fn/mean": 3.416562080383301, + "rewards/reward_fn/std": 0.694709300994873, + "step": 932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1833.0, + "completions/mean_length": 476.09375, + "completions/mean_terminated_length": 425.3870849609375, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.09897104062798345, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.022749242838472128, + "learning_rate": 7.6272e-06, + "loss": 0.2832, + "num_tokens": 43340805.0, + "reward": 3.8043880462646484, + "reward_std": 0.7476766109466553, + "rewards/reward_fn/mean": 3.8043880462646484, + "rewards/reward_fn/std": 0.7476766109466553, + "step": 933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1864.0, + "completions/mean_length": 560.71875, + "completions/mean_terminated_length": 461.5666809082031, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.09907711891375835, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.02255812706425786, + "learning_rate": 7.6267999999999996e-06, + "loss": 0.1704, + "num_tokens": 43409628.0, + "reward": 3.572598934173584, + "reward_std": 1.1030330657958984, + "rewards/reward_fn/mean": 3.572598934173584, + "rewards/reward_fn/std": 1.1030330657958984, + "step": 934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 688.0, + "completions/max_terminated_length": 688.0, + "completions/mean_length": 477.0, + "completions/mean_terminated_length": 477.0, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.09918319719953325, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.017629666137509048, + "learning_rate": 7.6263999999999995e-06, + "loss": -0.0552, + "num_tokens": 43443996.0, + "reward": 2.8076069355010986, + "reward_std": 0.29366716742515564, + "rewards/reward_fn/mean": 2.8076069355010986, + "rewards/reward_fn/std": 0.29366716742515564, + "step": 935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1123.0, + "completions/max_terminated_length": 1123.0, + "completions/mean_length": 305.6875, + "completions/mean_terminated_length": 305.6875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.09928927548530815, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.02135719486977905, + "learning_rate": 7.626e-06, + "loss": 0.0887, + "num_tokens": 43491442.0, + "reward": 3.76924991607666, + "reward_std": 0.48810887336730957, + "rewards/reward_fn/mean": 3.76924991607666, + "rewards/reward_fn/std": 0.4881088435649872, + "step": 936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 170.0, + "completions/max_terminated_length": 170.0, + "completions/mean_length": 144.9375, + "completions/mean_terminated_length": 144.9375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.09939535377108306, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12890625, + "kl": 0.024941423209384084, + "learning_rate": 7.6256e-06, + "loss": 0.001, + "num_tokens": 43530480.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 978.0, + "completions/max_terminated_length": 978.0, + "completions/mean_length": 505.1875, + "completions/mean_terminated_length": 505.1875, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.09950143205685796, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.0277643243316561, + "learning_rate": 7.6252e-06, + "loss": 0.0105, + "num_tokens": 43579542.0, + "reward": 2.689387321472168, + "reward_std": 0.519531786441803, + "rewards/reward_fn/mean": 2.689387321472168, + "rewards/reward_fn/std": 0.519531786441803, + "step": 938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 156.46875, + "completions/mean_terminated_length": 156.46875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.09960751034263286, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4375, + "kl": 0.024052355904132128, + "learning_rate": 7.624799999999999e-06, + "loss": 0.0352, + "num_tokens": 43613061.0, + "reward": 3.1143696308135986, + "reward_std": 0.43374985456466675, + "rewards/reward_fn/mean": 3.1143696308135986, + "rewards/reward_fn/std": 0.43374985456466675, + "step": 939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1454.0, + "completions/max_terminated_length": 1454.0, + "completions/mean_length": 422.25, + "completions/mean_terminated_length": 422.25, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.09971358862840776, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.027861488750204444, + "learning_rate": 7.624399999999999e-06, + "loss": 0.269, + "num_tokens": 43657805.0, + "reward": 2.7488107681274414, + "reward_std": 0.2731390595436096, + "rewards/reward_fn/mean": 2.7488107681274414, + "rewards/reward_fn/std": 0.27313902974128723, + "step": 940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 310.46875, + "completions/mean_terminated_length": 310.46875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.09981966691418266, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.01709406217560172, + "learning_rate": 7.623999999999999e-06, + "loss": -0.0491, + "num_tokens": 43689948.0, + "reward": 3.244318723678589, + "reward_std": 0.6297351121902466, + "rewards/reward_fn/mean": 3.244318723678589, + "rewards/reward_fn/std": 0.6297351121902466, + "step": 941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 776.0, + "completions/max_terminated_length": 776.0, + "completions/mean_length": 236.59375, + "completions/mean_terminated_length": 236.59375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.09992574519995757, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "kl": 0.029757092706859112, + "learning_rate": 7.623599999999999e-06, + "loss": -0.0306, + "num_tokens": 43729743.0, + "reward": 3.699985980987549, + "reward_std": 0.48831066489219666, + "rewards/reward_fn/mean": 3.699985980987549, + "rewards/reward_fn/std": 0.48831063508987427, + "step": 942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1085.0, + "completions/max_terminated_length": 1085.0, + "completions/mean_length": 359.71875, + "completions/mean_terminated_length": 359.71875, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.10003182348573247, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.0185692491941154, + "learning_rate": 7.623199999999999e-06, + "loss": 0.0006, + "num_tokens": 43781254.0, + "reward": 3.932116985321045, + "reward_std": 0.38400447368621826, + "rewards/reward_fn/mean": 3.932116985321045, + "rewards/reward_fn/std": 0.38400450348854065, + "step": 943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 706.0, + "completions/max_terminated_length": 706.0, + "completions/mean_length": 215.125, + "completions/mean_terminated_length": 215.125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.10013790177150737, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08349609375, + "kl": 0.022619884461164474, + "learning_rate": 7.622799999999999e-06, + "loss": 0.0009, + "num_tokens": 43821258.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 129.46875, + "completions/mean_terminated_length": 129.46875, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.10024398005728227, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5, + "kl": 0.030272313859313726, + "learning_rate": 7.622399999999999e-06, + "loss": -0.0063, + "num_tokens": 43859129.0, + "reward": 3.9288439750671387, + "reward_std": 0.28039026260375977, + "rewards/reward_fn/mean": 3.9288439750671387, + "rewards/reward_fn/std": 0.2803902328014374, + "step": 945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1575.0, + "completions/max_terminated_length": 1575.0, + "completions/mean_length": 345.75, + "completions/mean_terminated_length": 345.75, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.10035005834305717, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.022123834351077676, + "learning_rate": 7.621999999999999e-06, + "loss": -0.064, + "num_tokens": 43899505.0, + "reward": 2.691455364227295, + "reward_std": 0.3160874843597412, + "rewards/reward_fn/mean": 2.691455364227295, + "rewards/reward_fn/std": 0.3160874545574188, + "step": 946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1028.0, + "completions/max_terminated_length": 1028.0, + "completions/mean_length": 164.59375, + "completions/mean_terminated_length": 164.59375, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.10045613662883207, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.125, + "kl": 0.027731532929465175, + "learning_rate": 7.6216e-06, + "loss": 0.0011, + "num_tokens": 43945348.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 605.0, + "completions/max_terminated_length": 605.0, + "completions/mean_length": 211.8125, + "completions/mean_terminated_length": 211.8125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.10056221491460698, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.0301664131693542, + "learning_rate": 7.6212e-06, + "loss": -0.0287, + "num_tokens": 43992798.0, + "reward": 3.930818796157837, + "reward_std": 0.3913477957248688, + "rewards/reward_fn/mean": 3.930818796157837, + "rewards/reward_fn/std": 0.391347736120224, + "step": 948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1828.0, + "completions/max_terminated_length": 1828.0, + "completions/mean_length": 496.125, + "completions/mean_terminated_length": 496.125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.10066829320038188, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.019181253854185343, + "learning_rate": 7.6208e-06, + "loss": 0.0394, + "num_tokens": 44044322.0, + "reward": 3.3273301124572754, + "reward_std": 0.7396343946456909, + "rewards/reward_fn/mean": 3.3273301124572754, + "rewards/reward_fn/std": 0.7396343946456909, + "step": 949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1993.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 602.75, + "completions/mean_terminated_length": 602.75, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.10077437148615678, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.028830960392951965, + "learning_rate": 7.6204e-06, + "loss": -0.0344, + "num_tokens": 44096154.0, + "reward": 1.9313682317733765, + "reward_std": 0.4274498224258423, + "rewards/reward_fn/mean": 1.9313682317733765, + "rewards/reward_fn/std": 0.4274497926235199, + "step": 950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1048.0, + "completions/max_terminated_length": 1048.0, + "completions/mean_length": 310.3125, + "completions/mean_terminated_length": 310.3125, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.10088044977193168, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.025494011351838708, + "learning_rate": 7.62e-06, + "loss": 0.1634, + "num_tokens": 44178660.0, + "reward": 3.572533369064331, + "reward_std": 0.6003016233444214, + "rewards/reward_fn/mean": 3.572533369064331, + "rewards/reward_fn/std": 0.6003016829490662, + "step": 951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 870.0, + "completions/max_terminated_length": 870.0, + "completions/mean_length": 385.1875, + "completions/mean_terminated_length": 385.1875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.10098652805770658, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.021688956068828702, + "learning_rate": 7.6196e-06, + "loss": 0.0484, + "num_tokens": 44213994.0, + "reward": 3.7324137687683105, + "reward_std": 0.669641375541687, + "rewards/reward_fn/mean": 3.7324137687683105, + "rewards/reward_fn/std": 0.669641375541687, + "step": 952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.0, + "completions/max_terminated_length": 743.0, + "completions/mean_length": 322.71875, + "completions/mean_terminated_length": 322.71875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.1010926063434815, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.020461396779865026, + "learning_rate": 7.6192e-06, + "loss": -0.0531, + "num_tokens": 44267777.0, + "reward": 3.1110587120056152, + "reward_std": 0.43503865599632263, + "rewards/reward_fn/mean": 3.1110587120056152, + "rewards/reward_fn/std": 0.43503862619400024, + "step": 953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 244.84375, + "completions/mean_terminated_length": 244.84375, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.1011986846292564, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.021231455844826996, + "learning_rate": 7.6188e-06, + "loss": 0.0719, + "num_tokens": 44324988.0, + "reward": 3.789945602416992, + "reward_std": 0.5255440473556519, + "rewards/reward_fn/mean": 3.789945602416992, + "rewards/reward_fn/std": 0.5255439877510071, + "step": 954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1725.0, + "completions/mean_length": 539.4375, + "completions/mean_terminated_length": 490.774169921875, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.1013047629150313, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.019856604980304837, + "learning_rate": 7.6184e-06, + "loss": 0.0295, + "num_tokens": 44379306.0, + "reward": 1.9409844875335693, + "reward_std": 0.5565598011016846, + "rewards/reward_fn/mean": 1.9409844875335693, + "rewards/reward_fn/std": 0.5565597414970398, + "step": 955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 782.0, + "completions/mean_length": 499.71875, + "completions/mean_terminated_length": 449.774169921875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.10141084120080619, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.025161937111988664, + "learning_rate": 7.618e-06, + "loss": 0.1219, + "num_tokens": 44439393.0, + "reward": 2.661538600921631, + "reward_std": 0.39979878067970276, + "rewards/reward_fn/mean": 2.661538600921631, + "rewards/reward_fn/std": 0.399798721075058, + "step": 956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1571.0, + "completions/max_terminated_length": 1571.0, + "completions/mean_length": 397.9375, + "completions/mean_terminated_length": 397.9375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.10151691948658109, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.546875, + "kl": 0.027824259363114834, + "learning_rate": 7.6176e-06, + "loss": -0.0297, + "num_tokens": 44486335.0, + "reward": 2.802313804626465, + "reward_std": 0.21157807111740112, + "rewards/reward_fn/mean": 2.802313804626465, + "rewards/reward_fn/std": 0.21157805621623993, + "step": 957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 925.0, + "completions/max_terminated_length": 925.0, + "completions/mean_length": 458.6875, + "completions/mean_terminated_length": 458.6875, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.101622997772356, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.01801618025638163, + "learning_rate": 7.6172e-06, + "loss": 0.0008, + "num_tokens": 44541941.0, + "reward": 2.776561737060547, + "reward_std": 0.20328201353549957, + "rewards/reward_fn/mean": 2.776561737060547, + "rewards/reward_fn/std": 0.20328204333782196, + "step": 958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1326.0, + "completions/max_terminated_length": 1326.0, + "completions/mean_length": 260.125, + "completions/mean_terminated_length": 260.125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.1017290760581309, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "kl": 0.03001307207159698, + "learning_rate": 7.6168e-06, + "loss": -0.0721, + "num_tokens": 44599129.0, + "reward": 3.648618221282959, + "reward_std": 0.493778258562088, + "rewards/reward_fn/mean": 3.648618221282959, + "rewards/reward_fn/std": 0.4937782883644104, + "step": 959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 149.3125, + "completions/mean_terminated_length": 149.3125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.1018351543439058, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "kl": 0.03182677808217704, + "learning_rate": 7.6164e-06, + "loss": -0.0079, + "num_tokens": 44627427.0, + "reward": 3.9289231300354004, + "reward_std": 0.4020720422267914, + "rewards/reward_fn/mean": 3.9289231300354004, + "rewards/reward_fn/std": 0.4020719826221466, + "step": 960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 791.0, + "completions/max_terminated_length": 791.0, + "completions/mean_length": 195.65625, + "completions/mean_terminated_length": 195.65625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.1019412326296807, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08935546875, + "kl": 0.025319629814475775, + "learning_rate": 7.616e-06, + "loss": 0.001, + "num_tokens": 44652216.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 155.125, + "completions/mean_terminated_length": 155.125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.1020473109154556, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.625, + "kl": 0.03323507239110768, + "learning_rate": 7.6155999999999996e-06, + "loss": 0.1249, + "num_tokens": 44690076.0, + "reward": 2.9434142112731934, + "reward_std": 0.04483083263039589, + "rewards/reward_fn/mean": 2.9434142112731934, + "rewards/reward_fn/std": 0.044830840080976486, + "step": 962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 948.0, + "completions/max_terminated_length": 948.0, + "completions/mean_length": 269.8125, + "completions/mean_terminated_length": 269.8125, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.1021533892012305, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.04052834562025964, + "learning_rate": 7.6151999999999995e-06, + "loss": 0.0596, + "num_tokens": 44742006.0, + "reward": 3.814697265625, + "reward_std": 0.4385845363140106, + "rewards/reward_fn/mean": 3.814697265625, + "rewards/reward_fn/std": 0.43858450651168823, + "step": 963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 567.0, + "completions/max_terminated_length": 567.0, + "completions/mean_length": 337.90625, + "completions/mean_terminated_length": 337.90625, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.10225946748700541, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.027457768563181162, + "learning_rate": 7.6147999999999995e-06, + "loss": 0.0372, + "num_tokens": 44779315.0, + "reward": 3.6313183307647705, + "reward_std": 0.5176029205322266, + "rewards/reward_fn/mean": 3.6313183307647705, + "rewards/reward_fn/std": 0.5176029801368713, + "step": 964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1231.0, + "completions/max_terminated_length": 1231.0, + "completions/mean_length": 328.5, + "completions/mean_terminated_length": 328.5, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.10236554577278031, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.023238977417349815, + "learning_rate": 7.6143999999999995e-06, + "loss": 0.0578, + "num_tokens": 44820835.0, + "reward": 2.8203535079956055, + "reward_std": 0.028799260035157204, + "rewards/reward_fn/mean": 2.8203535079956055, + "rewards/reward_fn/std": 0.028799280524253845, + "step": 965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1401.0, + "completions/mean_length": 591.9375, + "completions/mean_terminated_length": 544.9677124023438, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.10247162405855521, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.018361143651418388, + "learning_rate": 7.6139999999999994e-06, + "loss": 0.212, + "num_tokens": 44876033.0, + "reward": 2.5604774951934814, + "reward_std": 0.6667385101318359, + "rewards/reward_fn/mean": 2.5604774951934814, + "rewards/reward_fn/std": 0.6667385101318359, + "step": 966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 167.6875, + "completions/mean_terminated_length": 167.6875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.10257770234433011, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1796875, + "kl": 0.028179120272397995, + "learning_rate": 7.613599999999999e-06, + "loss": 0.0011, + "num_tokens": 44924119.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 727.0, + "completions/max_terminated_length": 727.0, + "completions/mean_length": 346.5, + "completions/mean_terminated_length": 346.5, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.10268378063010501, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.027506219688802958, + "learning_rate": 7.613199999999999e-06, + "loss": 0.0885, + "num_tokens": 44979431.0, + "reward": 2.5675511360168457, + "reward_std": 0.48671212792396545, + "rewards/reward_fn/mean": 2.5675511360168457, + "rewards/reward_fn/std": 0.48671212792396545, + "step": 968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1400.0, + "completions/max_terminated_length": 1400.0, + "completions/mean_length": 320.90625, + "completions/mean_terminated_length": 320.90625, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.10278985891587993, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.02348879328928888, + "learning_rate": 7.612799999999999e-06, + "loss": 0.0561, + "num_tokens": 45025924.0, + "reward": 3.9348433017730713, + "reward_std": 0.25716766715049744, + "rewards/reward_fn/mean": 3.9348433017730713, + "rewards/reward_fn/std": 0.25716766715049744, + "step": 969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 348.03125, + "completions/mean_terminated_length": 293.19354248046875, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.10289593720165482, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.02728769346140325, + "learning_rate": 7.612399999999999e-06, + "loss": 0.299, + "num_tokens": 45080165.0, + "reward": 3.480499744415283, + "reward_std": 0.8933743834495544, + "rewards/reward_fn/mean": 3.480499744415283, + "rewards/reward_fn/std": 0.8933743834495544, + "step": 970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 825.0, + "completions/max_terminated_length": 825.0, + "completions/mean_length": 227.75, + "completions/mean_terminated_length": 227.75, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.10300201548742972, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.027561142342165112, + "learning_rate": 7.612e-06, + "loss": -0.0759, + "num_tokens": 45130973.0, + "reward": 3.857459783554077, + "reward_std": 0.33770760893821716, + "rewards/reward_fn/mean": 3.857459783554077, + "rewards/reward_fn/std": 0.3377075791358948, + "step": 971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.0, + "completions/max_terminated_length": 638.0, + "completions/mean_length": 299.78125, + "completions/mean_terminated_length": 299.78125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.10310809377320462, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.02726740762591362, + "learning_rate": 7.6116e-06, + "loss": -0.0207, + "num_tokens": 45169302.0, + "reward": 2.7473816871643066, + "reward_std": 0.1762368530035019, + "rewards/reward_fn/mean": 2.7473816871643066, + "rewards/reward_fn/std": 0.1762368530035019, + "step": 972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 157.0, + "completions/max_terminated_length": 157.0, + "completions/mean_length": 101.09375, + "completions/mean_terminated_length": 101.09375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.10321417205897952, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.140625, + "kl": 0.033008648082613945, + "learning_rate": 7.6112e-06, + "loss": 0.0013, + "num_tokens": 45197241.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 285.46875, + "completions/mean_terminated_length": 285.46875, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.10332025034475444, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.02396201086230576, + "learning_rate": 7.6108e-06, + "loss": 0.0925, + "num_tokens": 45240744.0, + "reward": 1.7027685642242432, + "reward_std": 0.03398967534303665, + "rewards/reward_fn/mean": 1.7027685642242432, + "rewards/reward_fn/std": 0.03398967534303665, + "step": 974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 194.375, + "completions/mean_terminated_length": 194.375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.10342632863052934, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1044921875, + "kl": 0.031388872768729925, + "learning_rate": 7.6104e-06, + "loss": 0.0013, + "num_tokens": 45280020.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1202.0, + "completions/max_terminated_length": 1202.0, + "completions/mean_length": 226.59375, + "completions/mean_terminated_length": 226.59375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.10353240691630423, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.03019576147198677, + "learning_rate": 7.61e-06, + "loss": -0.0612, + "num_tokens": 45328007.0, + "reward": 3.906989336013794, + "reward_std": 0.30038613080978394, + "rewards/reward_fn/mean": 3.906989336013794, + "rewards/reward_fn/std": 0.3003861606121063, + "step": 976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 919.0, + "completions/max_terminated_length": 919.0, + "completions/mean_length": 299.59375, + "completions/mean_terminated_length": 299.59375, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.10363848520207913, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.024021990364417434, + "learning_rate": 7.6096e-06, + "loss": -0.0009, + "num_tokens": 45371802.0, + "reward": 2.7480766773223877, + "reward_std": 0.044364336878061295, + "rewards/reward_fn/mean": 2.7480766773223877, + "rewards/reward_fn/std": 0.0443643182516098, + "step": 977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 610.0, + "completions/mean_length": 388.46875, + "completions/mean_terminated_length": 334.93548583984375, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.10374456348785403, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.021239728201180696, + "learning_rate": 7.6092e-06, + "loss": 0.2854, + "num_tokens": 45415017.0, + "reward": 3.8120007514953613, + "reward_std": 0.7408618330955505, + "rewards/reward_fn/mean": 3.8120007514953613, + "rewards/reward_fn/std": 0.7408618330955505, + "step": 978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 794.0, + "completions/max_terminated_length": 794.0, + "completions/mean_length": 197.78125, + "completions/mean_terminated_length": 197.78125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.10385064177362893, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09033203125, + "kl": 0.02474969206377864, + "learning_rate": 7.608799999999999e-06, + "loss": 0.001, + "num_tokens": 45455330.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1123.0, + "completions/max_terminated_length": 1123.0, + "completions/mean_length": 413.9375, + "completions/mean_terminated_length": 413.9375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.10395672005940385, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.01774134172592312, + "learning_rate": 7.608399999999999e-06, + "loss": 0.0342, + "num_tokens": 45504768.0, + "reward": 2.7947754859924316, + "reward_std": 0.048552006483078, + "rewards/reward_fn/mean": 2.7947754859924316, + "rewards/reward_fn/std": 0.048552028834819794, + "step": 980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 203.03125, + "completions/mean_terminated_length": 203.03125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.10406279834517874, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0927734375, + "kl": 0.025733540067449212, + "learning_rate": 7.607999999999999e-06, + "loss": 0.001, + "num_tokens": 45550593.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 222.5625, + "completions/mean_terminated_length": 222.5625, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.10416887663095364, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.023094360250979662, + "learning_rate": 7.607599999999999e-06, + "loss": 0.0127, + "num_tokens": 45598131.0, + "reward": 2.8349218368530273, + "reward_std": 0.058446187525987625, + "rewards/reward_fn/mean": 2.8349218368530273, + "rewards/reward_fn/std": 0.05844619497656822, + "step": 982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1743.0, + "completions/max_terminated_length": 1743.0, + "completions/mean_length": 378.90625, + "completions/mean_terminated_length": 378.90625, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.10427495491672854, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.028874794021248817, + "learning_rate": 7.6072e-06, + "loss": -0.044, + "num_tokens": 45648208.0, + "reward": 3.8350107669830322, + "reward_std": 0.39003050327301025, + "rewards/reward_fn/mean": 3.8350107669830322, + "rewards/reward_fn/std": 0.39003047347068787, + "step": 983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 232.03125, + "completions/mean_terminated_length": 232.03125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.10438103320250344, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.0243125488050282, + "learning_rate": 7.6068e-06, + "loss": 0.0304, + "num_tokens": 45700241.0, + "reward": 3.8992538452148438, + "reward_std": 0.3183940649032593, + "rewards/reward_fn/mean": 3.8992538452148438, + "rewards/reward_fn/std": 0.3183940649032593, + "step": 984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 663.0, + "completions/max_terminated_length": 663.0, + "completions/mean_length": 215.53125, + "completions/mean_terminated_length": 215.53125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.10448711148827836, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.328125, + "kl": 0.05102335708215833, + "learning_rate": 7.6064e-06, + "loss": -0.0175, + "num_tokens": 45759362.0, + "reward": 3.928581476211548, + "reward_std": 0.40400430560112, + "rewards/reward_fn/mean": 3.928581476211548, + "rewards/reward_fn/std": 0.40400430560112, + "step": 985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 861.0, + "completions/max_terminated_length": 861.0, + "completions/mean_length": 509.96875, + "completions/mean_terminated_length": 509.96875, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.10459318977405326, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.018654248444363475, + "learning_rate": 7.606e-06, + "loss": -0.0156, + "num_tokens": 45806657.0, + "reward": 3.7861359119415283, + "reward_std": 0.6757104992866516, + "rewards/reward_fn/mean": 3.7861359119415283, + "rewards/reward_fn/std": 0.6757104396820068, + "step": 986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 225.625, + "completions/mean_terminated_length": 225.625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.10469926805982815, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.390625, + "kl": 0.019475288689136505, + "learning_rate": 7.6056e-06, + "loss": -0.0837, + "num_tokens": 45853589.0, + "reward": 2.9933929443359375, + "reward_std": 0.440521240234375, + "rewards/reward_fn/mean": 2.9933929443359375, + "rewards/reward_fn/std": 0.440521240234375, + "step": 987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 220.96875, + "completions/mean_terminated_length": 220.96875, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.10480534634560305, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "kl": 0.02420557360164821, + "learning_rate": 7.6052e-06, + "loss": 0.0416, + "num_tokens": 45912820.0, + "reward": 3.9028656482696533, + "reward_std": 0.3068977892398834, + "rewards/reward_fn/mean": 3.9028656482696533, + "rewards/reward_fn/std": 0.3068977892398834, + "step": 988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 293.875, + "completions/mean_terminated_length": 293.875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.10491142463137795, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.0265792990103364, + "learning_rate": 7.6048e-06, + "loss": 0.0531, + "num_tokens": 45961456.0, + "reward": 3.8468708992004395, + "reward_std": 0.41194701194763184, + "rewards/reward_fn/mean": 3.8468708992004395, + "rewards/reward_fn/std": 0.41194698214530945, + "step": 989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 284.53125, + "completions/mean_terminated_length": 284.53125, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.10501750291715285, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.014853294123895466, + "learning_rate": 7.6043999999999996e-06, + "loss": -0.0016, + "num_tokens": 46004897.0, + "reward": 3.556659460067749, + "reward_std": 0.4804559051990509, + "rewards/reward_fn/mean": 3.556659460067749, + "rewards/reward_fn/std": 0.4804559350013733, + "step": 990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 148.71875, + "completions/mean_terminated_length": 148.71875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.10512358120292777, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.890625, + "kl": 0.01824998517986387, + "learning_rate": 7.6039999999999995e-06, + "loss": 0.1187, + "num_tokens": 46042200.0, + "reward": 3.964564561843872, + "reward_std": 0.2004532665014267, + "rewards/reward_fn/mean": 3.964564561843872, + "rewards/reward_fn/std": 0.2004532814025879, + "step": 991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 220.625, + "completions/mean_terminated_length": 220.625, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.10522965948870266, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.020736161852255464, + "learning_rate": 7.6035999999999995e-06, + "loss": -0.0034, + "num_tokens": 46085484.0, + "reward": 3.8878173828125, + "reward_std": 0.3545609414577484, + "rewards/reward_fn/mean": 3.8878173828125, + "rewards/reward_fn/std": 0.3545609414577484, + "step": 992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 987.0, + "completions/max_terminated_length": 987.0, + "completions/mean_length": 276.5625, + "completions/mean_terminated_length": 276.5625, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.10533573777447756, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07373046875, + "kl": 0.019082832615822554, + "learning_rate": 7.6031999999999995e-06, + "loss": 0.0008, + "num_tokens": 46130718.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 118.0, + "completions/max_terminated_length": 118.0, + "completions/mean_length": 79.4375, + "completions/mean_terminated_length": 79.4375, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.10544181606025246, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.03125, + "kl": 0.02028225746471435, + "learning_rate": 7.6028e-06, + "loss": 0.0617, + "num_tokens": 46156524.0, + "reward": 3.9314725399017334, + "reward_std": 0.3876495361328125, + "rewards/reward_fn/mean": 3.9314725399017334, + "rewards/reward_fn/std": 0.3876495659351349, + "step": 994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 986.0, + "completions/max_terminated_length": 986.0, + "completions/mean_length": 261.5625, + "completions/mean_terminated_length": 261.5625, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.10554789434602736, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09326171875, + "kl": 0.02337950048968196, + "learning_rate": 7.6024e-06, + "loss": 0.0009, + "num_tokens": 46201086.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 177.34375, + "completions/mean_terminated_length": 177.34375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.10565397263180228, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0869140625, + "kl": 0.023212826810777187, + "learning_rate": 7.602e-06, + "loss": 0.0009, + "num_tokens": 46232425.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 604.0, + "completions/max_terminated_length": 604.0, + "completions/mean_length": 310.28125, + "completions/mean_terminated_length": 310.28125, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.10576005091757718, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.028905097395181656, + "learning_rate": 7.6016e-06, + "loss": 0.0946, + "num_tokens": 46271346.0, + "reward": 2.8112916946411133, + "reward_std": 1.1012606620788574, + "rewards/reward_fn/mean": 2.8112916946411133, + "rewards/reward_fn/std": 1.1012605428695679, + "step": 997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 164.28125, + "completions/mean_terminated_length": 164.28125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.10586612920335207, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.023424757411703467, + "learning_rate": 7.6012e-06, + "loss": 0.047, + "num_tokens": 46305499.0, + "reward": 3.071727991104126, + "reward_std": 0.04227209836244583, + "rewards/reward_fn/mean": 3.071727991104126, + "rewards/reward_fn/std": 0.042272068560123444, + "step": 998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 215.46875, + "completions/mean_terminated_length": 215.46875, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.10597220748912697, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.020644933450967073, + "learning_rate": 7.600799999999999e-06, + "loss": 0.0149, + "num_tokens": 46356042.0, + "reward": 3.8897316455841064, + "reward_std": 0.4581899642944336, + "rewards/reward_fn/mean": 3.8897316455841064, + "rewards/reward_fn/std": 0.458189994096756, + "step": 999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 159.0, + "completions/max_terminated_length": 159.0, + "completions/mean_length": 102.78125, + "completions/mean_terminated_length": 102.78125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.10607828577490187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08740234375, + "kl": 0.016965405957307667, + "learning_rate": 7.600399999999999e-06, + "loss": 0.0007, + "num_tokens": 46400163.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 167.6875, + "completions/mean_terminated_length": 167.6875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.10618436406067679, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11474609375, + "kl": 0.02575285453349352, + "learning_rate": 7.599999999999999e-06, + "loss": 0.001, + "num_tokens": 46440729.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 630.0, + "completions/max_terminated_length": 630.0, + "completions/mean_length": 316.84375, + "completions/mean_terminated_length": 316.84375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.10629044234645169, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.0235441483091563, + "learning_rate": 7.599599999999999e-06, + "loss": 0.0518, + "num_tokens": 46485972.0, + "reward": 2.7349283695220947, + "reward_std": 0.3006006181240082, + "rewards/reward_fn/mean": 2.7349283695220947, + "rewards/reward_fn/std": 0.3006005883216858, + "step": 1002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 653.0, + "completions/max_terminated_length": 653.0, + "completions/mean_length": 311.46875, + "completions/mean_terminated_length": 311.46875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.10639652063222658, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.019770273473113775, + "learning_rate": 7.599199999999999e-06, + "loss": 0.0655, + "num_tokens": 46531491.0, + "reward": 3.046574831008911, + "reward_std": 0.034899428486824036, + "rewards/reward_fn/mean": 3.046574831008911, + "rewards/reward_fn/std": 0.034899454563856125, + "step": 1003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1233.0, + "completions/max_terminated_length": 1233.0, + "completions/mean_length": 219.84375, + "completions/mean_terminated_length": 219.84375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.10650259891800148, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.022727035451680422, + "learning_rate": 7.598799999999999e-06, + "loss": 0.26, + "num_tokens": 46583710.0, + "reward": 2.8298349380493164, + "reward_std": 0.035851918160915375, + "rewards/reward_fn/mean": 2.8298349380493164, + "rewards/reward_fn/std": 0.03585192188620567, + "step": 1004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1732.0, + "completions/max_terminated_length": 1732.0, + "completions/mean_length": 469.0625, + "completions/mean_terminated_length": 469.0625, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.10660867720377638, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.017710814368911088, + "learning_rate": 7.598399999999999e-06, + "loss": -0.0522, + "num_tokens": 46648288.0, + "reward": 3.8750860691070557, + "reward_std": 0.3370124101638794, + "rewards/reward_fn/mean": 3.8750860691070557, + "rewards/reward_fn/std": 0.337012380361557, + "step": 1005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2024.0, + "completions/max_terminated_length": 2024.0, + "completions/mean_length": 510.9375, + "completions/mean_terminated_length": 510.9375, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.10671475548955128, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.018817924661561847, + "learning_rate": 7.598e-06, + "loss": 0.0941, + "num_tokens": 46701182.0, + "reward": 3.7116811275482178, + "reward_std": 0.6552125811576843, + "rewards/reward_fn/mean": 3.7116811275482178, + "rewards/reward_fn/std": 0.6552125215530396, + "step": 1006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1690.0, + "completions/max_terminated_length": 1690.0, + "completions/mean_length": 590.78125, + "completions/mean_terminated_length": 590.78125, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "epoch": 0.1068208337753262, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.020602114964276552, + "learning_rate": 7.5976e-06, + "loss": -0.0235, + "num_tokens": 46758519.0, + "reward": 2.503760814666748, + "reward_std": 0.6346949338912964, + "rewards/reward_fn/mean": 2.503760814666748, + "rewards/reward_fn/std": 0.6346949338912964, + "step": 1007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 277.1875, + "completions/mean_terminated_length": 277.1875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.1069269120611011, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.018794673145748675, + "learning_rate": 7.5972e-06, + "loss": 0.0631, + "num_tokens": 46811837.0, + "reward": 2.8126931190490723, + "reward_std": 0.046026017516851425, + "rewards/reward_fn/mean": 2.8126931190490723, + "rewards/reward_fn/std": 0.04602604731917381, + "step": 1008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 370.4375, + "completions/mean_terminated_length": 316.32257080078125, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.107032990346876, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "kl": 0.02782504353672266, + "learning_rate": 7.5968e-06, + "loss": 0.3013, + "num_tokens": 46857707.0, + "reward": 3.1419944763183594, + "reward_std": 0.6233690977096558, + "rewards/reward_fn/mean": 3.1419944763183594, + "rewards/reward_fn/std": 0.6233690977096558, + "step": 1009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 101.96875, + "completions/mean_terminated_length": 101.96875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.1071390686326509, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.201171875, + "kl": 0.022366830613464117, + "learning_rate": 7.5964e-06, + "loss": 0.0009, + "num_tokens": 46896874.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 834.0, + "completions/max_terminated_length": 834.0, + "completions/mean_length": 425.25, + "completions/mean_terminated_length": 425.25, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.1072451469184258, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.025819960748776793, + "learning_rate": 7.596e-06, + "loss": 0.0277, + "num_tokens": 46952274.0, + "reward": 2.5166473388671875, + "reward_std": 0.42934849858283997, + "rewards/reward_fn/mean": 2.5166473388671875, + "rewards/reward_fn/std": 0.42934852838516235, + "step": 1011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1864.0, + "completions/mean_length": 967.0, + "completions/mean_terminated_length": 894.933349609375, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "epoch": 0.1073512252042007, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9453125, + "kl": 0.01642954268027097, + "learning_rate": 7.5956e-06, + "loss": 0.1213, + "num_tokens": 47010386.0, + "reward": 2.331674337387085, + "reward_std": 0.7580024003982544, + "rewards/reward_fn/mean": 2.331674337387085, + "rewards/reward_fn/std": 0.7580023407936096, + "step": 1012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 177.8125, + "completions/mean_terminated_length": 177.8125, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.1074573034899756, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09423828125, + "kl": 0.021480249939486384, + "learning_rate": 7.5952e-06, + "loss": 0.0009, + "num_tokens": 47050796.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 839.0, + "completions/max_terminated_length": 839.0, + "completions/mean_length": 278.1875, + "completions/mean_terminated_length": 278.1875, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.1075633817757505, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.028301968472078443, + "learning_rate": 7.5948e-06, + "loss": 0.0219, + "num_tokens": 47100498.0, + "reward": 3.1440305709838867, + "reward_std": 0.5901278853416443, + "rewards/reward_fn/mean": 3.1440305709838867, + "rewards/reward_fn/std": 0.5901278257369995, + "step": 1014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 220.96875, + "completions/mean_terminated_length": 220.96875, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.1076694600615254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10595703125, + "kl": 0.025707697961479425, + "learning_rate": 7.5944e-06, + "loss": 0.001, + "num_tokens": 47161393.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 955.0, + "completions/mean_length": 632.0625, + "completions/mean_terminated_length": 586.3870849609375, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.1077755383473003, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.02113516186363995, + "learning_rate": 7.594e-06, + "loss": 0.2314, + "num_tokens": 47223315.0, + "reward": 2.526656150817871, + "reward_std": 0.6006430983543396, + "rewards/reward_fn/mean": 2.526656150817871, + "rewards/reward_fn/std": 0.6006431579589844, + "step": 1016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1488.0, + "completions/max_terminated_length": 1488.0, + "completions/mean_length": 676.65625, + "completions/mean_terminated_length": 676.65625, + "completions/min_length": 400.0, + "completions/min_terminated_length": 400.0, + "epoch": 0.1078816166330752, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0703125, + "kl": 0.01965883933007717, + "learning_rate": 7.5936e-06, + "loss": -0.0026, + "num_tokens": 47279528.0, + "reward": 2.664598226547241, + "reward_std": 0.19238144159317017, + "rewards/reward_fn/mean": 2.664598226547241, + "rewards/reward_fn/std": 0.19238145649433136, + "step": 1017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.0, + "completions/max_terminated_length": 687.0, + "completions/mean_length": 166.8125, + "completions/mean_terminated_length": 166.8125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.10798769491885012, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10498046875, + "kl": 0.02208179165609181, + "learning_rate": 7.5932e-06, + "loss": 0.0009, + "num_tokens": 47334466.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1037.0, + "completions/max_terminated_length": 1037.0, + "completions/mean_length": 287.625, + "completions/mean_terminated_length": 287.625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.10809377320462502, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.390625, + "kl": 0.030248204711824656, + "learning_rate": 7.5928e-06, + "loss": 0.1147, + "num_tokens": 47382646.0, + "reward": 3.9577441215515137, + "reward_std": 0.23903484642505646, + "rewards/reward_fn/mean": 3.9577441215515137, + "rewards/reward_fn/std": 0.23903487622737885, + "step": 1019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1610.0, + "completions/max_terminated_length": 1610.0, + "completions/mean_length": 223.875, + "completions/mean_terminated_length": 223.875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.10819985149039991, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.02753225597552955, + "learning_rate": 7.5923999999999995e-06, + "loss": -0.1361, + "num_tokens": 47424050.0, + "reward": 3.9119248390197754, + "reward_std": 0.27822986245155334, + "rewards/reward_fn/mean": 3.9119248390197754, + "rewards/reward_fn/std": 0.27822983264923096, + "step": 1020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1636.0, + "completions/mean_length": 445.1875, + "completions/mean_terminated_length": 393.4838562011719, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.10830592977617481, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.025169007247313857, + "learning_rate": 7.5919999999999995e-06, + "loss": 0.2699, + "num_tokens": 47475288.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 1021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 665.0, + "completions/max_terminated_length": 665.0, + "completions/mean_length": 432.5, + "completions/mean_terminated_length": 432.5, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.10841200806194971, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.020216177916154265, + "learning_rate": 7.5915999999999994e-06, + "loss": 0.0523, + "num_tokens": 47528808.0, + "reward": 2.7419400215148926, + "reward_std": 0.19625967741012573, + "rewards/reward_fn/mean": 2.7419400215148926, + "rewards/reward_fn/std": 0.19625964760780334, + "step": 1022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 995.0, + "completions/max_terminated_length": 995.0, + "completions/mean_length": 570.21875, + "completions/mean_terminated_length": 570.21875, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.10851808634772463, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.024952333886176348, + "learning_rate": 7.591199999999999e-06, + "loss": 0.0449, + "num_tokens": 47584559.0, + "reward": 2.8316707611083984, + "reward_std": 0.03383928909897804, + "rewards/reward_fn/mean": 2.8316707611083984, + "rewards/reward_fn/std": 0.033839285373687744, + "step": 1023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 245.625, + "completions/mean_terminated_length": 245.625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.10862416463349953, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.03429146436974406, + "learning_rate": 7.590799999999999e-06, + "loss": 0.0576, + "num_tokens": 47635491.0, + "reward": 3.8389077186584473, + "reward_std": 0.38074901700019836, + "rewards/reward_fn/mean": 3.8389077186584473, + "rewards/reward_fn/std": 0.38074901700019836, + "step": 1024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 777.0, + "completions/max_terminated_length": 777.0, + "completions/mean_length": 473.84375, + "completions/mean_terminated_length": 473.84375, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.10873024291927443, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.022358035668730736, + "learning_rate": 7.590399999999999e-06, + "loss": 0.04, + "num_tokens": 47704702.0, + "reward": 2.8870303630828857, + "reward_std": 0.09213743358850479, + "rewards/reward_fn/mean": 2.8870303630828857, + "rewards/reward_fn/std": 0.09213750809431076, + "step": 1025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 803.0, + "completions/max_terminated_length": 803.0, + "completions/mean_length": 356.9375, + "completions/mean_terminated_length": 356.9375, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.10883632120504932, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.02047534054145217, + "learning_rate": 7.589999999999999e-06, + "loss": 0.1106, + "num_tokens": 47758972.0, + "reward": 3.618471145629883, + "reward_std": 0.714113712310791, + "rewards/reward_fn/mean": 3.618471145629883, + "rewards/reward_fn/std": 0.714113712310791, + "step": 1026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 190.65625, + "completions/mean_terminated_length": 190.65625, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.10894239949082422, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15234375, + "kl": 0.05442382441833615, + "learning_rate": 7.589599999999999e-06, + "loss": 0.0022, + "num_tokens": 47801393.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 824.0, + "completions/max_terminated_length": 824.0, + "completions/mean_length": 286.5, + "completions/mean_terminated_length": 286.5, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.10904847777659914, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8828125, + "kl": 0.022991040954366326, + "learning_rate": 7.589199999999999e-06, + "loss": 0.0042, + "num_tokens": 47844385.0, + "reward": 2.956698179244995, + "reward_std": 0.3429322838783264, + "rewards/reward_fn/mean": 2.956698179244995, + "rewards/reward_fn/std": 0.34293225407600403, + "step": 1028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 121.875, + "completions/mean_terminated_length": 121.875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.10915455606237404, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.171875, + "kl": 0.030354263726621866, + "learning_rate": 7.588799999999999e-06, + "loss": 0.1608, + "num_tokens": 47876573.0, + "reward": 3.9108946323394775, + "reward_std": 0.2824559509754181, + "rewards/reward_fn/mean": 3.9108946323394775, + "rewards/reward_fn/std": 0.2824559211730957, + "step": 1029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 178.28125, + "completions/mean_terminated_length": 178.28125, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.10926063434814894, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "kl": 0.02428922988474369, + "learning_rate": 7.5884e-06, + "loss": 0.0219, + "num_tokens": 47910662.0, + "reward": 3.896930456161499, + "reward_std": 0.3276534676551819, + "rewards/reward_fn/mean": 3.896930456161499, + "rewards/reward_fn/std": 0.3276534676551819, + "step": 1030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2019.0, + "completions/max_terminated_length": 2019.0, + "completions/mean_length": 508.96875, + "completions/mean_terminated_length": 508.96875, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.10936671263392383, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.02369822352193296, + "learning_rate": 7.588e-06, + "loss": 0.0164, + "num_tokens": 47948101.0, + "reward": 3.1501495838165283, + "reward_std": 0.7063373923301697, + "rewards/reward_fn/mean": 3.1501495838165283, + "rewards/reward_fn/std": 0.7063372731208801, + "step": 1031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1060.0, + "completions/mean_length": 404.75, + "completions/mean_terminated_length": 351.7419128417969, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.10947279091969873, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.025314223021268845, + "learning_rate": 7.5876e-06, + "loss": 0.2811, + "num_tokens": 47982717.0, + "reward": 3.2338669300079346, + "reward_std": 0.8930133581161499, + "rewards/reward_fn/mean": 3.2338669300079346, + "rewards/reward_fn/std": 0.8930133581161499, + "step": 1032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.0, + "completions/max_terminated_length": 579.0, + "completions/mean_length": 240.5625, + "completions/mean_terminated_length": 240.5625, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.10957886920547363, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.023625839967280626, + "learning_rate": 7.5872e-06, + "loss": -0.0313, + "num_tokens": 48036559.0, + "reward": 3.2713961601257324, + "reward_std": 0.5741091370582581, + "rewards/reward_fn/mean": 3.2713961601257324, + "rewards/reward_fn/std": 0.5741091966629028, + "step": 1033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 842.0, + "completions/max_terminated_length": 842.0, + "completions/mean_length": 434.53125, + "completions/mean_terminated_length": 434.53125, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.10968494749124855, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.021874976810067892, + "learning_rate": 7.5868e-06, + "loss": -0.033, + "num_tokens": 48087968.0, + "reward": 2.664581298828125, + "reward_std": 0.03544781729578972, + "rewards/reward_fn/mean": 2.664581298828125, + "rewards/reward_fn/std": 0.03544781729578972, + "step": 1034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 165.59375, + "completions/mean_terminated_length": 165.59375, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.10979102577702345, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.53125, + "kl": 0.024668074445798993, + "learning_rate": 7.5864e-06, + "loss": -0.0904, + "num_tokens": 48128627.0, + "reward": 2.9213130474090576, + "reward_std": 0.2021249383687973, + "rewards/reward_fn/mean": 2.9213130474090576, + "rewards/reward_fn/std": 0.2021249383687973, + "step": 1035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1630.0, + "completions/max_terminated_length": 1630.0, + "completions/mean_length": 583.875, + "completions/mean_terminated_length": 583.875, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.10989710406279835, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.017253244761377573, + "learning_rate": 7.586e-06, + "loss": -0.0018, + "num_tokens": 48182063.0, + "reward": 3.2175590991973877, + "reward_std": 0.9283421635627747, + "rewards/reward_fn/mean": 3.2175590991973877, + "rewards/reward_fn/std": 0.9283421635627747, + "step": 1036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 764.0, + "completions/max_terminated_length": 764.0, + "completions/mean_length": 272.5, + "completions/mean_terminated_length": 272.5, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.11000318234857324, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0908203125, + "kl": 0.022420917404815555, + "learning_rate": 7.5856e-06, + "loss": 0.0009, + "num_tokens": 48225567.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 625.0, + "completions/max_terminated_length": 625.0, + "completions/mean_length": 394.40625, + "completions/mean_terminated_length": 394.40625, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.11010926063434814, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.01980700553394854, + "learning_rate": 7.5852e-06, + "loss": 0.0774, + "num_tokens": 48293036.0, + "reward": 2.6475255489349365, + "reward_std": 0.2643485963344574, + "rewards/reward_fn/mean": 2.6475255489349365, + "rewards/reward_fn/std": 0.2643485963344574, + "step": 1038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 615.0, + "completions/max_terminated_length": 615.0, + "completions/mean_length": 287.40625, + "completions/mean_terminated_length": 287.40625, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.11021533892012306, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.026065111625939608, + "learning_rate": 7.5848e-06, + "loss": 0.0832, + "num_tokens": 48346649.0, + "reward": 3.965364456176758, + "reward_std": 0.19592823088169098, + "rewards/reward_fn/mean": 3.965364456176758, + "rewards/reward_fn/std": 0.19592821598052979, + "step": 1039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 151.875, + "completions/mean_terminated_length": 151.875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.11032141720589796, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12109375, + "kl": 0.021192287211306393, + "learning_rate": 7.584399999999999e-06, + "loss": 0.0008, + "num_tokens": 48383221.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 174.34375, + "completions/mean_terminated_length": 174.34375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.11042749549167286, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.453125, + "kl": 0.021020429907366633, + "learning_rate": 7.583999999999999e-06, + "loss": 0.0779, + "num_tokens": 48431456.0, + "reward": 3.859943151473999, + "reward_std": 0.3045395612716675, + "rewards/reward_fn/mean": 3.859943151473999, + "rewards/reward_fn/std": 0.30453959107398987, + "step": 1041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 655.0, + "completions/max_terminated_length": 655.0, + "completions/mean_length": 286.65625, + "completions/mean_terminated_length": 286.65625, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.11053357377744776, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.029660561122000217, + "learning_rate": 7.5836e-06, + "loss": 0.1015, + "num_tokens": 48497877.0, + "reward": 3.7863168716430664, + "reward_std": 0.6171298027038574, + "rewards/reward_fn/mean": 3.7863168716430664, + "rewards/reward_fn/std": 0.6171298027038574, + "step": 1042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 205.34375, + "completions/mean_terminated_length": 205.34375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.11063965206322265, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.033476054668426514, + "learning_rate": 7.5832e-06, + "loss": -0.0209, + "num_tokens": 48536768.0, + "reward": 3.891145706176758, + "reward_std": 0.3448222577571869, + "rewards/reward_fn/mean": 3.891145706176758, + "rewards/reward_fn/std": 0.3448222279548645, + "step": 1043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1634.0, + "completions/max_terminated_length": 1634.0, + "completions/mean_length": 546.5625, + "completions/mean_terminated_length": 546.5625, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.11074573034899755, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.109375, + "kl": 0.019996803253889084, + "learning_rate": 7.5828e-06, + "loss": -0.0001, + "num_tokens": 48590354.0, + "reward": 2.7087881565093994, + "reward_std": 0.3276941478252411, + "rewards/reward_fn/mean": 2.7087881565093994, + "rewards/reward_fn/std": 0.3276940882205963, + "step": 1044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1202.0, + "completions/max_terminated_length": 1202.0, + "completions/mean_length": 487.53125, + "completions/mean_terminated_length": 487.53125, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.11085180863477247, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057373046875, + "kl": 0.02068508369848132, + "learning_rate": 7.5824e-06, + "loss": 0.0008, + "num_tokens": 48642499.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 877.0, + "completions/max_terminated_length": 877.0, + "completions/mean_length": 200.09375, + "completions/mean_terminated_length": 200.09375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.11095788692054737, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "kl": 0.043467214331030846, + "learning_rate": 7.5819999999999996e-06, + "loss": -0.021, + "num_tokens": 48691526.0, + "reward": 3.014760971069336, + "reward_std": 0.32387152314186096, + "rewards/reward_fn/mean": 3.014760971069336, + "rewards/reward_fn/std": 0.32387155294418335, + "step": 1046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 173.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 147.09375, + "completions/mean_terminated_length": 147.09375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.11106396520632227, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.015362325706519186, + "learning_rate": 7.5815999999999995e-06, + "loss": 0.0156, + "num_tokens": 48732105.0, + "reward": 3.898536205291748, + "reward_std": 0.4298121929168701, + "rewards/reward_fn/mean": 3.898536205291748, + "rewards/reward_fn/std": 0.42981216311454773, + "step": 1047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 256.1875, + "completions/mean_terminated_length": 256.1875, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.11117004349209716, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.02533377055078745, + "learning_rate": 7.5811999999999995e-06, + "loss": 0.0145, + "num_tokens": 48787151.0, + "reward": 1.7609915733337402, + "reward_std": 0.20412206649780273, + "rewards/reward_fn/mean": 1.7609915733337402, + "rewards/reward_fn/std": 0.20412209630012512, + "step": 1048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 237.09375, + "completions/mean_terminated_length": 237.09375, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.11127612177787206, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.02054914110340178, + "learning_rate": 7.5807999999999995e-06, + "loss": 0.0838, + "num_tokens": 48841618.0, + "reward": 3.9238462448120117, + "reward_std": 0.2997784912586212, + "rewards/reward_fn/mean": 3.9238462448120117, + "rewards/reward_fn/std": 0.29977843165397644, + "step": 1049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1064.0, + "completions/max_terminated_length": 1064.0, + "completions/mean_length": 245.75, + "completions/mean_terminated_length": 245.75, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.11138220006364698, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.021705527789890766, + "learning_rate": 7.5803999999999995e-06, + "loss": 0.0682, + "num_tokens": 48871338.0, + "reward": 3.700695037841797, + "reward_std": 0.5276238918304443, + "rewards/reward_fn/mean": 3.700695037841797, + "rewards/reward_fn/std": 0.5276238918304443, + "step": 1050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1289.0, + "completions/max_terminated_length": 1289.0, + "completions/mean_length": 485.65625, + "completions/mean_terminated_length": 485.65625, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.11148827834942188, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.018874148838222027, + "learning_rate": 7.5799999999999994e-06, + "loss": 0.0752, + "num_tokens": 48906847.0, + "reward": 3.036792278289795, + "reward_std": 0.6571252346038818, + "rewards/reward_fn/mean": 3.036792278289795, + "rewards/reward_fn/std": 0.6571252942085266, + "step": 1051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 855.0, + "completions/mean_length": 537.78125, + "completions/mean_terminated_length": 489.06451416015625, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.11159435663519678, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.02108449419029057, + "learning_rate": 7.579599999999999e-06, + "loss": 0.2119, + "num_tokens": 48944184.0, + "reward": 2.643126964569092, + "reward_std": 0.8047422170639038, + "rewards/reward_fn/mean": 2.643126964569092, + "rewards/reward_fn/std": 0.8047422170639038, + "step": 1052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 157.0, + "completions/max_terminated_length": 157.0, + "completions/mean_length": 112.3125, + "completions/mean_terminated_length": 112.3125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.11170043492097168, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15234375, + "kl": 0.02645600028336048, + "learning_rate": 7.5792e-06, + "loss": 0.0011, + "num_tokens": 48984898.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 730.0, + "completions/max_terminated_length": 730.0, + "completions/mean_length": 215.65625, + "completions/mean_terminated_length": 215.65625, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.11180651320674657, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10009765625, + "kl": 0.020799231133423746, + "learning_rate": 7.5788e-06, + "loss": 0.0008, + "num_tokens": 49017527.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 148.71875, + "completions/mean_terminated_length": 148.71875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.11191259149252149, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.59375, + "kl": 0.02697490993887186, + "learning_rate": 7.5784e-06, + "loss": 0.0291, + "num_tokens": 49061166.0, + "reward": 3.6497604846954346, + "reward_std": 0.5690338611602783, + "rewards/reward_fn/mean": 3.6497604846954346, + "rewards/reward_fn/std": 0.5690338611602783, + "step": 1055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1170.0, + "completions/max_terminated_length": 1170.0, + "completions/mean_length": 248.40625, + "completions/mean_terminated_length": 248.40625, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.11201866977829639, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.028018145821988583, + "learning_rate": 7.578e-06, + "loss": 0.187, + "num_tokens": 49101179.0, + "reward": 3.8402042388916016, + "reward_std": 0.4300036132335663, + "rewards/reward_fn/mean": 3.8402042388916016, + "rewards/reward_fn/std": 0.4300036132335663, + "step": 1056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 121.96875, + "completions/mean_terminated_length": 121.96875, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.11212474806407129, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0, + "kl": 0.02238781377673149, + "learning_rate": 7.5776e-06, + "loss": 0.1305, + "num_tokens": 49142554.0, + "reward": 3.9166479110717773, + "reward_std": 0.32798945903778076, + "rewards/reward_fn/mean": 3.9166479110717773, + "rewards/reward_fn/std": 0.32798945903778076, + "step": 1057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 107.09375, + "completions/mean_terminated_length": 107.09375, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.11223082634984619, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2294921875, + "kl": 0.029718552948907018, + "learning_rate": 7.5772e-06, + "loss": 0.0012, + "num_tokens": 49190333.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 378.3125, + "completions/mean_terminated_length": 378.3125, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.11233690463562108, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.020035079680383205, + "learning_rate": 7.5768e-06, + "loss": -0.0189, + "num_tokens": 49247655.0, + "reward": 3.7000904083251953, + "reward_std": 0.5648115277290344, + "rewards/reward_fn/mean": 3.7000904083251953, + "rewards/reward_fn/std": 0.5648115277290344, + "step": 1059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 112.0, + "completions/max_terminated_length": 112.0, + "completions/mean_length": 84.5, + "completions/mean_terminated_length": 84.5, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.11244298292139598, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1669921875, + "kl": 0.01883522653952241, + "learning_rate": 7.576399999999999e-06, + "loss": 0.0008, + "num_tokens": 49292759.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 926.0, + "completions/max_terminated_length": 926.0, + "completions/mean_length": 416.625, + "completions/mean_terminated_length": 416.625, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.1125490612071709, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.0226012347266078, + "learning_rate": 7.575999999999999e-06, + "loss": 0.0211, + "num_tokens": 49321707.0, + "reward": 3.0304574966430664, + "reward_std": 0.6895338892936707, + "rewards/reward_fn/mean": 3.0304574966430664, + "rewards/reward_fn/std": 0.6895338296890259, + "step": 1061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 97.875, + "completions/mean_terminated_length": 97.875, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.1126551394929458, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.171875, + "kl": 0.02689792774617672, + "learning_rate": 7.575599999999999e-06, + "loss": 0.0011, + "num_tokens": 49369447.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1449.0, + "completions/max_terminated_length": 1449.0, + "completions/mean_length": 288.6875, + "completions/mean_terminated_length": 288.6875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.1127612177787207, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10107421875, + "kl": 0.026896960800513625, + "learning_rate": 7.575199999999999e-06, + "loss": 0.0011, + "num_tokens": 49407837.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 242.8125, + "completions/mean_terminated_length": 242.8125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.1128672960644956, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.03066409262828529, + "learning_rate": 7.574799999999999e-06, + "loss": 0.0108, + "num_tokens": 49437655.0, + "reward": 3.775486469268799, + "reward_std": 0.6500386595726013, + "rewards/reward_fn/mean": 3.775486469268799, + "rewards/reward_fn/std": 0.6500386595726013, + "step": 1064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 721.0, + "completions/max_terminated_length": 721.0, + "completions/mean_length": 416.5, + "completions/mean_terminated_length": 416.5, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.1129733743502705, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.021771566243842244, + "learning_rate": 7.5744e-06, + "loss": -0.0615, + "num_tokens": 49483047.0, + "reward": 3.53684139251709, + "reward_std": 0.9053143858909607, + "rewards/reward_fn/mean": 3.53684139251709, + "rewards/reward_fn/std": 0.9053143858909607, + "step": 1065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 160.03125, + "completions/mean_terminated_length": 160.03125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.11307945263604541, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.109375, + "kl": 0.020757366786710918, + "learning_rate": 7.574e-06, + "loss": 0.0008, + "num_tokens": 49539272.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 171.625, + "completions/mean_terminated_length": 171.625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.11318553092182031, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.111328125, + "kl": 0.02480534859932959, + "learning_rate": 7.5736e-06, + "loss": 0.001, + "num_tokens": 49575612.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 635.0, + "completions/max_terminated_length": 635.0, + "completions/mean_length": 343.0, + "completions/mean_terminated_length": 343.0, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.1132916092075952, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.019847781863063574, + "learning_rate": 7.5732e-06, + "loss": 0.0209, + "num_tokens": 49620540.0, + "reward": 2.8053698539733887, + "reward_std": 0.044293008744716644, + "rewards/reward_fn/mean": 2.8053698539733887, + "rewards/reward_fn/std": 0.04429301992058754, + "step": 1068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 846.0, + "completions/max_terminated_length": 846.0, + "completions/mean_length": 408.25, + "completions/mean_terminated_length": 408.25, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.1133976874933701, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.022560104727745056, + "learning_rate": 7.5728e-06, + "loss": -0.0794, + "num_tokens": 49671780.0, + "reward": 2.5189239978790283, + "reward_std": 0.7050349712371826, + "rewards/reward_fn/mean": 2.5189239978790283, + "rewards/reward_fn/std": 0.7050350308418274, + "step": 1069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 962.0, + "completions/max_terminated_length": 962.0, + "completions/mean_length": 395.28125, + "completions/mean_terminated_length": 395.28125, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.113503765779145, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.024412208003923297, + "learning_rate": 7.5724e-06, + "loss": 0.1008, + "num_tokens": 49722989.0, + "reward": 3.4259486198425293, + "reward_std": 0.41865074634552, + "rewards/reward_fn/mean": 3.4259486198425293, + "rewards/reward_fn/std": 0.41865074634552, + "step": 1070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 241.15625, + "completions/mean_terminated_length": 241.15625, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.1136098440649199, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.019247191143222153, + "learning_rate": 7.572e-06, + "loss": 0.0774, + "num_tokens": 49777074.0, + "reward": 3.9278671741485596, + "reward_std": 0.4080452620983124, + "rewards/reward_fn/mean": 3.9278671741485596, + "rewards/reward_fn/std": 0.4080452620983124, + "step": 1071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 804.0, + "completions/max_terminated_length": 804.0, + "completions/mean_length": 486.34375, + "completions/mean_terminated_length": 486.34375, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.11371592235069482, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.026156109059229493, + "learning_rate": 7.5716e-06, + "loss": -0.0044, + "num_tokens": 49830685.0, + "reward": 2.5489959716796875, + "reward_std": 0.32066285610198975, + "rewards/reward_fn/mean": 2.5489959716796875, + "rewards/reward_fn/std": 0.32066285610198975, + "step": 1072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1117.0, + "completions/max_terminated_length": 1117.0, + "completions/mean_length": 329.53125, + "completions/mean_terminated_length": 329.53125, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.11382200063646972, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.023007875541225076, + "learning_rate": 7.5712e-06, + "loss": 0.0351, + "num_tokens": 49877518.0, + "reward": 3.3481032848358154, + "reward_std": 0.6226815581321716, + "rewards/reward_fn/mean": 3.3481032848358154, + "rewards/reward_fn/std": 0.6226814985275269, + "step": 1073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 274.34375, + "completions/mean_terminated_length": 274.34375, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.11392807892224462, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0771484375, + "kl": 0.020330962724983692, + "learning_rate": 7.5708e-06, + "loss": 0.0008, + "num_tokens": 49918969.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 217.8125, + "completions/mean_terminated_length": 217.8125, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.11403415720801952, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06982421875, + "kl": 0.01607495010830462, + "learning_rate": 7.5703999999999995e-06, + "loss": 0.0006, + "num_tokens": 49970675.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 83.125, + "completions/mean_terminated_length": 83.125, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.11414023549379441, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1328125, + "kl": 0.018819114891812205, + "learning_rate": 7.5699999999999995e-06, + "loss": 0.0008, + "num_tokens": 50019447.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 213.09375, + "completions/mean_terminated_length": 213.09375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.11424631377956933, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.024625253630802035, + "learning_rate": 7.5696e-06, + "loss": -0.1125, + "num_tokens": 50067546.0, + "reward": 3.350964307785034, + "reward_std": 1.0544812679290771, + "rewards/reward_fn/mean": 3.350964307785034, + "rewards/reward_fn/std": 1.0544813871383667, + "step": 1077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1743.0, + "completions/max_terminated_length": 1743.0, + "completions/mean_length": 319.34375, + "completions/mean_terminated_length": 319.34375, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.11435239206534423, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.020671964855864644, + "learning_rate": 7.5692e-06, + "loss": 0.2282, + "num_tokens": 50111141.0, + "reward": 3.702298402786255, + "reward_std": 0.8400141596794128, + "rewards/reward_fn/mean": 3.702298402786255, + "rewards/reward_fn/std": 0.8400141596794128, + "step": 1078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 318.1875, + "completions/mean_terminated_length": 318.1875, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.11445847035111913, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.01740014727693051, + "learning_rate": 7.5688e-06, + "loss": 0.0787, + "num_tokens": 50157419.0, + "reward": 3.0187315940856934, + "reward_std": 0.6965243816375732, + "rewards/reward_fn/mean": 3.0187315940856934, + "rewards/reward_fn/std": 0.6965243816375732, + "step": 1079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 791.0, + "completions/max_terminated_length": 791.0, + "completions/mean_length": 284.3125, + "completions/mean_terminated_length": 284.3125, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.11456454863689403, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.024546197149902582, + "learning_rate": 7.568399999999999e-06, + "loss": -0.0078, + "num_tokens": 50187157.0, + "reward": 3.7409169673919678, + "reward_std": 0.4564998149871826, + "rewards/reward_fn/mean": 3.7409169673919678, + "rewards/reward_fn/std": 0.45649975538253784, + "step": 1080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 717.0, + "completions/max_terminated_length": 717.0, + "completions/mean_length": 212.3125, + "completions/mean_terminated_length": 212.3125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.11467062692266893, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.359375, + "kl": 0.02775608957745135, + "learning_rate": 7.567999999999999e-06, + "loss": 0.0725, + "num_tokens": 50225087.0, + "reward": 3.6997857093811035, + "reward_std": 0.4594448208808899, + "rewards/reward_fn/mean": 3.6997857093811035, + "rewards/reward_fn/std": 0.4594447910785675, + "step": 1081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1101.0, + "completions/max_terminated_length": 1101.0, + "completions/mean_length": 295.6875, + "completions/mean_terminated_length": 295.6875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.11477670520844384, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "kl": 0.026229602517560124, + "learning_rate": 7.567599999999999e-06, + "loss": 0.0526, + "num_tokens": 50275957.0, + "reward": 3.969325542449951, + "reward_std": 0.17352108657360077, + "rewards/reward_fn/mean": 3.969325542449951, + "rewards/reward_fn/std": 0.17352110147476196, + "step": 1082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 244.90625, + "completions/mean_terminated_length": 244.90625, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.11488278349421874, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.020248351618647575, + "learning_rate": 7.567199999999999e-06, + "loss": 0.0566, + "num_tokens": 50320498.0, + "reward": 2.9547595977783203, + "reward_std": 0.5952463746070862, + "rewards/reward_fn/mean": 2.9547595977783203, + "rewards/reward_fn/std": 0.5952463746070862, + "step": 1083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 227.5, + "completions/mean_terminated_length": 227.5, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.11498886177999364, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.022126831114292145, + "learning_rate": 7.566799999999999e-06, + "loss": 0.0513, + "num_tokens": 50361634.0, + "reward": 3.7756552696228027, + "reward_std": 0.6386120915412903, + "rewards/reward_fn/mean": 3.7756552696228027, + "rewards/reward_fn/std": 0.6386121511459351, + "step": 1084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 222.34375, + "completions/mean_terminated_length": 222.34375, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.11509494006576854, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.080078125, + "kl": 0.019203853677026927, + "learning_rate": 7.566399999999999e-06, + "loss": 0.0008, + "num_tokens": 50396141.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 99.0, + "completions/max_terminated_length": 99.0, + "completions/mean_length": 71.0, + "completions/mean_terminated_length": 71.0, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.11520101835154344, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.140625, + "kl": 0.023645572364330292, + "learning_rate": 7.565999999999999e-06, + "loss": 0.005, + "num_tokens": 50420653.0, + "reward": 3.9301629066467285, + "reward_std": 0.3950580954551697, + "rewards/reward_fn/mean": 3.9301629066467285, + "rewards/reward_fn/std": 0.3950580954551697, + "step": 1086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 125.0, + "completions/max_terminated_length": 125.0, + "completions/mean_length": 92.21875, + "completions/mean_terminated_length": 92.21875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.11530709663731833, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2177734375, + "kl": 0.021084198029711843, + "learning_rate": 7.565599999999999e-06, + "loss": 0.0008, + "num_tokens": 50465332.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1061.0, + "completions/max_terminated_length": 1061.0, + "completions/mean_length": 313.78125, + "completions/mean_terminated_length": 313.78125, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.11541317492309325, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.02206204435788095, + "learning_rate": 7.565199999999999e-06, + "loss": 0.0009, + "num_tokens": 50512781.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.0, + "completions/max_terminated_length": 551.0, + "completions/mean_length": 179.5, + "completions/mean_terminated_length": 179.5, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.11551925320886815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.083984375, + "kl": 0.01822281803470105, + "learning_rate": 7.5648e-06, + "loss": 0.0007, + "num_tokens": 50551677.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 120.0, + "completions/max_terminated_length": 120.0, + "completions/mean_length": 83.59375, + "completions/mean_terminated_length": 83.59375, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.11562533149464305, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8125, + "kl": 0.027714208932593465, + "learning_rate": 7.5644e-06, + "loss": -0.0259, + "num_tokens": 50628176.0, + "reward": 3.9305734634399414, + "reward_std": 0.39273524284362793, + "rewards/reward_fn/mean": 3.9305734634399414, + "rewards/reward_fn/std": 0.39273524284362793, + "step": 1090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 315.15625, + "completions/mean_terminated_length": 315.15625, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.11573140978041795, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.02233049925416708, + "learning_rate": 7.564e-06, + "loss": 0.0933, + "num_tokens": 50683957.0, + "reward": 3.8804166316986084, + "reward_std": 0.3777785003185272, + "rewards/reward_fn/mean": 3.8804166316986084, + "rewards/reward_fn/std": 0.3777785003185272, + "step": 1091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 371.125, + "completions/mean_terminated_length": 371.125, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.11583748806619285, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.029797785449773073, + "learning_rate": 7.5636e-06, + "loss": 0.0461, + "num_tokens": 50751865.0, + "reward": 2.858231544494629, + "reward_std": 1.1363381147384644, + "rewards/reward_fn/mean": 2.858231544494629, + "rewards/reward_fn/std": 1.1363381147384644, + "step": 1092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 184.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 131.09375, + "completions/mean_terminated_length": 131.09375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.11594356635196776, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10888671875, + "kl": 0.021672799484804273, + "learning_rate": 7.5632e-06, + "loss": 0.0009, + "num_tokens": 50799932.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 853.0, + "completions/max_terminated_length": 853.0, + "completions/mean_length": 330.0625, + "completions/mean_terminated_length": 330.0625, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.11604964463774266, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06591796875, + "kl": 0.017305174726061523, + "learning_rate": 7.5628e-06, + "loss": 0.0007, + "num_tokens": 50857982.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1091.0, + "completions/max_terminated_length": 1091.0, + "completions/mean_length": 243.03125, + "completions/mean_terminated_length": 243.03125, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.11615572292351756, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.02653159131295979, + "learning_rate": 7.5624e-06, + "loss": -0.0145, + "num_tokens": 50914719.0, + "reward": 3.928218126296997, + "reward_std": 0.40605998039245605, + "rewards/reward_fn/mean": 3.928218126296997, + "rewards/reward_fn/std": 0.40606001019477844, + "step": 1095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 158.625, + "completions/mean_terminated_length": 158.625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.11626180120929246, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10595703125, + "kl": 0.02485039341263473, + "learning_rate": 7.562e-06, + "loss": 0.001, + "num_tokens": 50947411.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1744.0, + "completions/max_terminated_length": 1744.0, + "completions/mean_length": 693.3125, + "completions/mean_terminated_length": 693.3125, + "completions/min_length": 380.0, + "completions/min_terminated_length": 380.0, + "epoch": 0.11636787949506736, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.02324887551367283, + "learning_rate": 7.5616e-06, + "loss": 0.0876, + "num_tokens": 51001437.0, + "reward": 2.568819522857666, + "reward_std": 0.38974249362945557, + "rewards/reward_fn/mean": 2.568819522857666, + "rewards/reward_fn/std": 0.3897424340248108, + "step": 1097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1142.0, + "completions/max_terminated_length": 1142.0, + "completions/mean_length": 305.78125, + "completions/mean_terminated_length": 305.78125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.11647395778084225, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.018533728667534888, + "learning_rate": 7.5612e-06, + "loss": 0.0793, + "num_tokens": 51048822.0, + "reward": 3.68546724319458, + "reward_std": 0.5669353008270264, + "rewards/reward_fn/mean": 3.68546724319458, + "rewards/reward_fn/std": 0.5669353008270264, + "step": 1098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 78.71875, + "completions/mean_terminated_length": 78.71875, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.11658003606661717, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1455078125, + "kl": 0.02107345312833786, + "learning_rate": 7.5608e-06, + "loss": 0.0008, + "num_tokens": 51085869.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 238.3125, + "completions/mean_terminated_length": 238.3125, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.11668611435239207, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.021306362934410572, + "learning_rate": 7.5604e-06, + "loss": 0.0548, + "num_tokens": 51137207.0, + "reward": 2.826180934906006, + "reward_std": 0.03045596368610859, + "rewards/reward_fn/mean": 2.826180934906006, + "rewards/reward_fn/std": 0.03045591339468956, + "step": 1100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 244.25, + "completions/mean_terminated_length": 244.25, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.11679219263816697, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.022923755925148726, + "learning_rate": 7.56e-06, + "loss": 0.0168, + "num_tokens": 51193855.0, + "reward": 3.835073947906494, + "reward_std": 0.48954635858535767, + "rewards/reward_fn/mean": 3.835073947906494, + "rewards/reward_fn/std": 0.48954638838768005, + "step": 1101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 208.34375, + "completions/mean_terminated_length": 208.34375, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.11689827092394187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08154296875, + "kl": 0.016427406924776733, + "learning_rate": 7.5596e-06, + "loss": 0.0007, + "num_tokens": 51233450.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 275.15625, + "completions/mean_terminated_length": 275.15625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.11700434920971677, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.02642908156849444, + "learning_rate": 7.5591999999999996e-06, + "loss": -0.0509, + "num_tokens": 51272687.0, + "reward": 2.866469144821167, + "reward_std": 0.3955315351486206, + "rewards/reward_fn/mean": 2.866469144821167, + "rewards/reward_fn/std": 0.395531564950943, + "step": 1103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1498.0, + "completions/mean_length": 786.0, + "completions/mean_terminated_length": 745.290283203125, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.11711042749549168, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.02294124150648713, + "learning_rate": 7.5587999999999995e-06, + "loss": 0.1011, + "num_tokens": 51341903.0, + "reward": 2.7701563835144043, + "reward_std": 0.355268269777298, + "rewards/reward_fn/mean": 2.7701563835144043, + "rewards/reward_fn/std": 0.3552682399749756, + "step": 1104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 213.4375, + "completions/mean_terminated_length": 213.4375, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.11721650578126658, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06787109375, + "kl": 0.017608007532544434, + "learning_rate": 7.5583999999999995e-06, + "loss": 0.0007, + "num_tokens": 51382781.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 365.03125, + "completions/mean_terminated_length": 365.03125, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.11732258406704148, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9375, + "kl": 0.021670927526429296, + "learning_rate": 7.5579999999999995e-06, + "loss": 0.0421, + "num_tokens": 51425758.0, + "reward": 2.7978286743164062, + "reward_std": 0.512295663356781, + "rewards/reward_fn/mean": 2.7978286743164062, + "rewards/reward_fn/std": 0.512295663356781, + "step": 1106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1947.0, + "completions/max_terminated_length": 1947.0, + "completions/mean_length": 535.5625, + "completions/mean_terminated_length": 535.5625, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.11742866235281638, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.01689821679610759, + "learning_rate": 7.5575999999999994e-06, + "loss": -0.0038, + "num_tokens": 51493776.0, + "reward": 2.7099485397338867, + "reward_std": 0.6777162551879883, + "rewards/reward_fn/mean": 2.7099485397338867, + "rewards/reward_fn/std": 0.6777163147926331, + "step": 1107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 946.0, + "completions/max_terminated_length": 946.0, + "completions/mean_length": 197.0, + "completions/mean_terminated_length": 197.0, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.11753474063859128, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.546875, + "kl": 0.02550308988429606, + "learning_rate": 7.557199999999999e-06, + "loss": 0.1913, + "num_tokens": 51556496.0, + "reward": 3.723670482635498, + "reward_std": 0.4863794147968292, + "rewards/reward_fn/mean": 3.723670482635498, + "rewards/reward_fn/std": 0.4863794147968292, + "step": 1108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1831.0, + "completions/max_terminated_length": 1831.0, + "completions/mean_length": 429.0625, + "completions/mean_terminated_length": 429.0625, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.11764081892436619, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.01923588034696877, + "learning_rate": 7.556799999999999e-06, + "loss": 0.0173, + "num_tokens": 51599986.0, + "reward": 2.7359695434570312, + "reward_std": 0.044656820595264435, + "rewards/reward_fn/mean": 2.7359695434570312, + "rewards/reward_fn/std": 0.044656842947006226, + "step": 1109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.0, + "completions/max_terminated_length": 545.0, + "completions/mean_length": 275.46875, + "completions/mean_terminated_length": 275.46875, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.11774689721014109, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.018184081302024424, + "learning_rate": 7.556399999999999e-06, + "loss": 0.0582, + "num_tokens": 51651681.0, + "reward": 3.522010564804077, + "reward_std": 0.5560933351516724, + "rewards/reward_fn/mean": 3.522010564804077, + "rewards/reward_fn/std": 0.5560933351516724, + "step": 1110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 959.0, + "completions/max_terminated_length": 959.0, + "completions/mean_length": 256.96875, + "completions/mean_terminated_length": 256.96875, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.11785297549591599, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.02644204581156373, + "learning_rate": 7.555999999999999e-06, + "loss": 0.0319, + "num_tokens": 51691008.0, + "reward": 3.9324989318847656, + "reward_std": 0.26562514901161194, + "rewards/reward_fn/mean": 3.9324989318847656, + "rewards/reward_fn/std": 0.26562511920928955, + "step": 1111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 606.0, + "completions/max_terminated_length": 606.0, + "completions/mean_length": 358.5, + "completions/mean_terminated_length": 358.5, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.11795905378169089, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.020751751959323883, + "learning_rate": 7.5556e-06, + "loss": -0.0171, + "num_tokens": 51747120.0, + "reward": 2.6532459259033203, + "reward_std": 0.33404526114463806, + "rewards/reward_fn/mean": 2.6532459259033203, + "rewards/reward_fn/std": 0.33404526114463806, + "step": 1112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1739.0, + "completions/max_terminated_length": 1739.0, + "completions/mean_length": 447.53125, + "completions/mean_terminated_length": 447.53125, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.11806513206746579, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.025134951109066606, + "learning_rate": 7.5552e-06, + "loss": 0.1821, + "num_tokens": 51800769.0, + "reward": 3.224299192428589, + "reward_std": 0.5572682619094849, + "rewards/reward_fn/mean": 3.224299192428589, + "rewards/reward_fn/std": 0.5572682619094849, + "step": 1113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1545.0, + "completions/max_terminated_length": 1545.0, + "completions/mean_length": 395.875, + "completions/mean_terminated_length": 395.875, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.11817121035324069, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.02163501945324242, + "learning_rate": 7.5548e-06, + "loss": 0.0656, + "num_tokens": 51843965.0, + "reward": 3.863697052001953, + "reward_std": 0.36755073070526123, + "rewards/reward_fn/mean": 3.863697052001953, + "rewards/reward_fn/std": 0.36755073070526123, + "step": 1114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 136.5625, + "completions/mean_terminated_length": 136.5625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.1182772886390156, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.123046875, + "kl": 0.030038248049095273, + "learning_rate": 7.5544e-06, + "loss": 0.0012, + "num_tokens": 51894063.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1081.0, + "completions/max_terminated_length": 1081.0, + "completions/mean_length": 412.90625, + "completions/mean_terminated_length": 412.90625, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.1183833669247905, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.018813129048794508, + "learning_rate": 7.554e-06, + "loss": -0.0249, + "num_tokens": 51940748.0, + "reward": 3.8100392818450928, + "reward_std": 0.5366294384002686, + "rewards/reward_fn/mean": 3.8100392818450928, + "rewards/reward_fn/std": 0.5366293787956238, + "step": 1116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 297.09375, + "completions/mean_terminated_length": 240.61289978027344, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.1184894452105654, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.01792726945132017, + "learning_rate": 7.5536e-06, + "loss": 0.281, + "num_tokens": 51983695.0, + "reward": 3.659058094024658, + "reward_std": 0.8081573843955994, + "rewards/reward_fn/mean": 3.659058094024658, + "rewards/reward_fn/std": 0.8081573843955994, + "step": 1117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 244.9375, + "completions/mean_terminated_length": 244.9375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.1185955234963403, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.02788339671678841, + "learning_rate": 7.5532e-06, + "loss": 0.0633, + "num_tokens": 52025869.0, + "reward": 2.9600257873535156, + "reward_std": 0.5432107448577881, + "rewards/reward_fn/mean": 2.9600257873535156, + "rewards/reward_fn/std": 0.5432106852531433, + "step": 1118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 184.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 109.15625, + "completions/mean_terminated_length": 109.15625, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.1187016017821152, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9375, + "kl": 0.03897233330644667, + "learning_rate": 7.5528e-06, + "loss": 0.043, + "num_tokens": 52061970.0, + "reward": 3.0810916423797607, + "reward_std": 0.24325308203697205, + "rewards/reward_fn/mean": 3.0810916423797607, + "rewards/reward_fn/std": 0.24325308203697205, + "step": 1119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 760.0, + "completions/max_terminated_length": 760.0, + "completions/mean_length": 374.53125, + "completions/mean_terminated_length": 374.53125, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.11880768006789011, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.017800742178224027, + "learning_rate": 7.552399999999999e-06, + "loss": 0.0929, + "num_tokens": 52104931.0, + "reward": 3.1744065284729004, + "reward_std": 0.5665863156318665, + "rewards/reward_fn/mean": 3.1744065284729004, + "rewards/reward_fn/std": 0.5665862560272217, + "step": 1120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 168.0, + "completions/max_terminated_length": 168.0, + "completions/mean_length": 117.1875, + "completions/mean_terminated_length": 117.1875, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.11891375835366501, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.640625, + "kl": 0.031384273897856474, + "learning_rate": 7.551999999999999e-06, + "loss": 0.0092, + "num_tokens": 52142857.0, + "reward": 3.7206034660339355, + "reward_std": 0.6296123266220093, + "rewards/reward_fn/mean": 3.7206034660339355, + "rewards/reward_fn/std": 0.6296123266220093, + "step": 1121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1076.0, + "completions/max_terminated_length": 1076.0, + "completions/mean_length": 336.40625, + "completions/mean_terminated_length": 336.40625, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.11901983663943991, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.020996463019400835, + "learning_rate": 7.551599999999999e-06, + "loss": -0.1134, + "num_tokens": 52188662.0, + "reward": 3.5195388793945312, + "reward_std": 0.6306620836257935, + "rewards/reward_fn/mean": 3.5195388793945312, + "rewards/reward_fn/std": 0.6306621432304382, + "step": 1122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.0, + "completions/max_terminated_length": 573.0, + "completions/mean_length": 321.84375, + "completions/mean_terminated_length": 321.84375, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.11912591492521481, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.01977117615751922, + "learning_rate": 7.551199999999999e-06, + "loss": 0.0567, + "num_tokens": 52233617.0, + "reward": 3.0178513526916504, + "reward_std": 0.6891320943832397, + "rewards/reward_fn/mean": 3.0178513526916504, + "rewards/reward_fn/std": 0.689132034778595, + "step": 1123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1633.0, + "completions/max_terminated_length": 1633.0, + "completions/mean_length": 547.84375, + "completions/mean_terminated_length": 547.84375, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.1192319932109897, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.020315645029768348, + "learning_rate": 7.5508e-06, + "loss": -0.0442, + "num_tokens": 52295468.0, + "reward": 2.052112340927124, + "reward_std": 0.5385510921478271, + "rewards/reward_fn/mean": 2.052112340927124, + "rewards/reward_fn/std": 0.5385510921478271, + "step": 1124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1653.0, + "completions/max_terminated_length": 1653.0, + "completions/mean_length": 408.5625, + "completions/mean_terminated_length": 408.5625, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.1193380714967646, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.0202141257468611, + "learning_rate": 7.5504e-06, + "loss": 0.0394, + "num_tokens": 52348126.0, + "reward": 2.4590096473693848, + "reward_std": 0.5020030736923218, + "rewards/reward_fn/mean": 2.4590096473693848, + "rewards/reward_fn/std": 0.5020030736923218, + "step": 1125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 811.0, + "completions/max_terminated_length": 811.0, + "completions/mean_length": 145.6875, + "completions/mean_terminated_length": 145.6875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.11944414978253952, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.140625, + "kl": 0.03049472742713988, + "learning_rate": 7.55e-06, + "loss": 0.0012, + "num_tokens": 52397780.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.0, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 250.96875, + "completions/mean_terminated_length": 250.96875, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.11955022806831442, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.037999976659193635, + "learning_rate": 7.5496e-06, + "loss": -0.0338, + "num_tokens": 52438483.0, + "reward": 2.8391571044921875, + "reward_std": 0.20920641720294952, + "rewards/reward_fn/mean": 2.8391571044921875, + "rewards/reward_fn/std": 0.20920643210411072, + "step": 1127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 242.125, + "completions/mean_terminated_length": 242.125, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.11965630635408932, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.02898483257740736, + "learning_rate": 7.5492e-06, + "loss": 0.0341, + "num_tokens": 52476727.0, + "reward": 3.4120028018951416, + "reward_std": 0.562468409538269, + "rewards/reward_fn/mean": 3.4120028018951416, + "rewards/reward_fn/std": 0.5624683499336243, + "step": 1128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1930.0, + "completions/mean_length": 1293.34375, + "completions/mean_terminated_length": 1153.5926513671875, + "completions/min_length": 746.0, + "completions/min_terminated_length": 746.0, + "epoch": 0.11976238463986422, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.98828125, + "kl": 0.018078222521580756, + "learning_rate": 7.5488e-06, + "loss": 0.1773, + "num_tokens": 52560418.0, + "reward": 1.8458271026611328, + "reward_std": 0.9291160106658936, + "rewards/reward_fn/mean": 1.8458271026611328, + "rewards/reward_fn/std": 0.9291160702705383, + "step": 1129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 196.375, + "completions/mean_terminated_length": 196.375, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.11986846292563912, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.087890625, + "kl": 0.021710637491196394, + "learning_rate": 7.5484e-06, + "loss": 0.0009, + "num_tokens": 52611342.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 169.0, + "completions/max_terminated_length": 169.0, + "completions/mean_length": 125.28125, + "completions/mean_terminated_length": 125.28125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.11997454121141403, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0771484375, + "kl": 0.012505219259765, + "learning_rate": 7.5479999999999996e-06, + "loss": 0.0005, + "num_tokens": 52658647.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 140.71875, + "completions/mean_terminated_length": 140.71875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.12008061949718893, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.080078125, + "kl": 0.018215686664916575, + "learning_rate": 7.5475999999999995e-06, + "loss": 0.0007, + "num_tokens": 52714286.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 764.0, + "completions/max_terminated_length": 764.0, + "completions/mean_length": 187.4375, + "completions/mean_terminated_length": 187.4375, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.12018669778296383, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1103515625, + "kl": 0.02393046487122774, + "learning_rate": 7.5471999999999995e-06, + "loss": 0.001, + "num_tokens": 52756636.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1577.0, + "completions/max_terminated_length": 1577.0, + "completions/mean_length": 313.84375, + "completions/mean_terminated_length": 313.84375, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.12029277606873873, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.026081232354044914, + "learning_rate": 7.5467999999999995e-06, + "loss": 0.0927, + "num_tokens": 52795991.0, + "reward": 3.3418869972229004, + "reward_std": 0.6929866671562195, + "rewards/reward_fn/mean": 3.3418869972229004, + "rewards/reward_fn/std": 0.6929866671562195, + "step": 1134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 190.78125, + "completions/mean_terminated_length": 190.78125, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.12039885435451363, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.091796875, + "kl": 0.021782017778605223, + "learning_rate": 7.5464e-06, + "loss": 0.0009, + "num_tokens": 52848784.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 210.65625, + "completions/mean_terminated_length": 210.65625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.12050493264028854, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.359375, + "kl": 0.022625808138400316, + "learning_rate": 7.546e-06, + "loss": 0.0627, + "num_tokens": 52891269.0, + "reward": 3.7008187770843506, + "reward_std": 0.7357956767082214, + "rewards/reward_fn/mean": 3.7008187770843506, + "rewards/reward_fn/std": 0.7357956767082214, + "step": 1136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 94.84375, + "completions/mean_terminated_length": 94.84375, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.12061101092606344, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11181640625, + "kl": 0.024228210328146815, + "learning_rate": 7.5456e-06, + "loss": 0.001, + "num_tokens": 52945920.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 168.0, + "completions/max_terminated_length": 168.0, + "completions/mean_length": 116.78125, + "completions/mean_terminated_length": 116.78125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.12071708921183834, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1455078125, + "kl": 0.027215805603191257, + "learning_rate": 7.5452e-06, + "loss": 0.0011, + "num_tokens": 52977177.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1196.0, + "completions/max_terminated_length": 1196.0, + "completions/mean_length": 333.34375, + "completions/mean_terminated_length": 333.34375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.12082316749761324, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "kl": 0.021321170032024384, + "learning_rate": 7.5448e-06, + "loss": 0.0052, + "num_tokens": 53023300.0, + "reward": 3.9064579010009766, + "reward_std": 0.2969244122505188, + "rewards/reward_fn/mean": 3.9064579010009766, + "rewards/reward_fn/std": 0.2969244122505188, + "step": 1139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 109.0, + "completions/max_terminated_length": 109.0, + "completions/mean_length": 79.90625, + "completions/mean_terminated_length": 79.90625, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.12092924578338814, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.953125, + "kl": 0.018354161293245852, + "learning_rate": 7.5444e-06, + "loss": -0.0223, + "num_tokens": 53060129.0, + "reward": 3.9402408599853516, + "reward_std": 0.23523320257663727, + "rewards/reward_fn/mean": 3.9402408599853516, + "rewards/reward_fn/std": 0.23523321747779846, + "step": 1140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 899.0, + "completions/max_terminated_length": 899.0, + "completions/mean_length": 429.6875, + "completions/mean_terminated_length": 429.6875, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.12103532406916304, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.02179452800191939, + "learning_rate": 7.543999999999999e-06, + "loss": 0.01, + "num_tokens": 53106231.0, + "reward": 2.8876233100891113, + "reward_std": 0.218718022108078, + "rewards/reward_fn/mean": 2.8876233100891113, + "rewards/reward_fn/std": 0.21871797740459442, + "step": 1141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1471.0, + "completions/max_terminated_length": 1471.0, + "completions/mean_length": 166.875, + "completions/mean_terminated_length": 166.875, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.12114140235493795, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.142578125, + "kl": 0.03363496996462345, + "learning_rate": 7.543599999999999e-06, + "loss": 0.0013, + "num_tokens": 53162131.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.0, + "completions/max_terminated_length": 568.0, + "completions/mean_length": 206.1875, + "completions/mean_terminated_length": 206.1875, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.12124748064071285, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8828125, + "kl": 0.017744799610227346, + "learning_rate": 7.543199999999999e-06, + "loss": 0.0345, + "num_tokens": 53198905.0, + "reward": 2.7245864868164062, + "reward_std": 0.032482411712408066, + "rewards/reward_fn/mean": 2.7245864868164062, + "rewards/reward_fn/std": 0.032482437789440155, + "step": 1143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 142.875, + "completions/mean_terminated_length": 142.875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.12135355892648775, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.30859375, + "kl": 0.036823015194386244, + "learning_rate": 7.542799999999999e-06, + "loss": 0.0015, + "num_tokens": 53243669.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 206.75, + "completions/mean_terminated_length": 206.75, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.12145963721226265, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.023123034741729498, + "learning_rate": 7.542399999999999e-06, + "loss": -0.0627, + "num_tokens": 53288429.0, + "reward": 2.7684268951416016, + "reward_std": 0.05548159033060074, + "rewards/reward_fn/mean": 2.7684268951416016, + "rewards/reward_fn/std": 0.055481597781181335, + "step": 1145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 957.0, + "completions/max_terminated_length": 957.0, + "completions/mean_length": 479.15625, + "completions/mean_terminated_length": 479.15625, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.12156571549803755, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.017751626786775887, + "learning_rate": 7.541999999999999e-06, + "loss": -0.0229, + "num_tokens": 53358578.0, + "reward": 2.996565580368042, + "reward_std": 0.6444182991981506, + "rewards/reward_fn/mean": 2.996565580368042, + "rewards/reward_fn/std": 0.6444182395935059, + "step": 1146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 200.71875, + "completions/mean_terminated_length": 200.71875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.12167179378381246, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.029425512766465545, + "learning_rate": 7.5416e-06, + "loss": 0.1157, + "num_tokens": 53400937.0, + "reward": 3.5051753520965576, + "reward_std": 0.6135579943656921, + "rewards/reward_fn/mean": 3.5051753520965576, + "rewards/reward_fn/std": 0.6135579347610474, + "step": 1147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.0, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 305.125, + "completions/mean_terminated_length": 305.125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.12177787206958736, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.020469692768529058, + "learning_rate": 7.5412e-06, + "loss": 0.0535, + "num_tokens": 53450349.0, + "reward": 2.800945997238159, + "reward_std": 0.287587434053421, + "rewards/reward_fn/mean": 2.800945997238159, + "rewards/reward_fn/std": 0.28758740425109863, + "step": 1148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 209.21875, + "completions/mean_terminated_length": 209.21875, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.12188395035536226, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9921875, + "kl": 0.022445352748036385, + "learning_rate": 7.5408e-06, + "loss": -0.0242, + "num_tokens": 53515540.0, + "reward": 3.882132053375244, + "reward_std": 0.372415155172348, + "rewards/reward_fn/mean": 3.882132053375244, + "rewards/reward_fn/std": 0.372415155172348, + "step": 1149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 926.0, + "completions/max_terminated_length": 926.0, + "completions/mean_length": 227.34375, + "completions/mean_terminated_length": 227.34375, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.12199002864113716, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.020551395835354924, + "learning_rate": 7.5404e-06, + "loss": 0.0163, + "num_tokens": 53560575.0, + "reward": 2.9072699546813965, + "reward_std": 0.2896516025066376, + "rewards/reward_fn/mean": 2.9072699546813965, + "rewards/reward_fn/std": 0.2896515727043152, + "step": 1150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 657.0, + "completions/max_terminated_length": 657.0, + "completions/mean_length": 262.375, + "completions/mean_terminated_length": 262.375, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.12209610692691206, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.024549536872655153, + "learning_rate": 7.54e-06, + "loss": 0.0296, + "num_tokens": 53605867.0, + "reward": 2.8168137073516846, + "reward_std": 0.31276002526283264, + "rewards/reward_fn/mean": 2.8168137073516846, + "rewards/reward_fn/std": 0.31275999546051025, + "step": 1151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1899.0, + "completions/max_terminated_length": 1899.0, + "completions/mean_length": 856.65625, + "completions/mean_terminated_length": 856.65625, + "completions/min_length": 469.0, + "completions/min_terminated_length": 469.0, + "epoch": 0.12220218521268696, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.98046875, + "kl": 0.01613958622328937, + "learning_rate": 7.5396e-06, + "loss": 0.1343, + "num_tokens": 53658240.0, + "reward": 2.469219207763672, + "reward_std": 0.45067059993743896, + "rewards/reward_fn/mean": 2.469219207763672, + "rewards/reward_fn/std": 0.4506705403327942, + "step": 1152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 313.40625, + "completions/mean_terminated_length": 313.40625, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.12230826349846187, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.026833078358322382, + "learning_rate": 7.5392e-06, + "loss": 0.0304, + "num_tokens": 53720845.0, + "reward": 2.7918012142181396, + "reward_std": 0.5359998941421509, + "rewards/reward_fn/mean": 2.7918012142181396, + "rewards/reward_fn/std": 0.5359998941421509, + "step": 1153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.0, + "completions/max_terminated_length": 577.0, + "completions/mean_length": 152.15625, + "completions/mean_terminated_length": 152.15625, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.12241434178423677, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.421875, + "kl": 0.0193988133687526, + "learning_rate": 7.5388e-06, + "loss": 0.0837, + "num_tokens": 53770290.0, + "reward": 2.6622109413146973, + "reward_std": 0.07657734304666519, + "rewards/reward_fn/mean": 2.6622109413146973, + "rewards/reward_fn/std": 0.0765773355960846, + "step": 1154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 910.0, + "completions/max_terminated_length": 910.0, + "completions/mean_length": 345.46875, + "completions/mean_terminated_length": 345.46875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.12252042007001167, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.01845894439611584, + "learning_rate": 7.5384e-06, + "loss": 0.0443, + "num_tokens": 53810017.0, + "reward": 2.8654346466064453, + "reward_std": 0.06292664259672165, + "rewards/reward_fn/mean": 2.8654346466064453, + "rewards/reward_fn/std": 0.06292665004730225, + "step": 1155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 109.0625, + "completions/mean_terminated_length": 109.0625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.12262649835578657, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1357421875, + "kl": 0.02120021218433976, + "learning_rate": 7.538e-06, + "loss": 0.0008, + "num_tokens": 53857475.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 180.1875, + "completions/mean_terminated_length": 180.1875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.12273257664156147, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.021761987125501037, + "learning_rate": 7.5376e-06, + "loss": 0.0012, + "num_tokens": 53895881.0, + "reward": 3.3576347827911377, + "reward_std": 0.5774080157279968, + "rewards/reward_fn/mean": 3.3576347827911377, + "rewards/reward_fn/std": 0.5774080157279968, + "step": 1157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1430.0, + "completions/max_terminated_length": 1430.0, + "completions/mean_length": 371.28125, + "completions/mean_terminated_length": 371.28125, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.12283865492733638, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.02653820696286857, + "learning_rate": 7.5372e-06, + "loss": 0.0207, + "num_tokens": 53918706.0, + "reward": 3.2473506927490234, + "reward_std": 0.6305427551269531, + "rewards/reward_fn/mean": 3.2473506927490234, + "rewards/reward_fn/std": 0.6305428147315979, + "step": 1158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 132.34375, + "completions/mean_terminated_length": 132.34375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.12294473321311128, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.515625, + "kl": 0.02793702925555408, + "learning_rate": 7.5368e-06, + "loss": 0.0161, + "num_tokens": 53953661.0, + "reward": 3.8980746269226074, + "reward_std": 0.3230654299259186, + "rewards/reward_fn/mean": 3.8980746269226074, + "rewards/reward_fn/std": 0.3230654001235962, + "step": 1159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1010.0, + "completions/max_terminated_length": 1010.0, + "completions/mean_length": 207.53125, + "completions/mean_terminated_length": 207.53125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.12305081149888618, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.296875, + "kl": 0.029348223470151424, + "learning_rate": 7.5364e-06, + "loss": 0.0201, + "num_tokens": 53978894.0, + "reward": 3.4990692138671875, + "reward_std": 0.6988479495048523, + "rewards/reward_fn/mean": 3.4990692138671875, + "rewards/reward_fn/std": 0.6988478899002075, + "step": 1160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 660.0, + "completions/max_terminated_length": 660.0, + "completions/mean_length": 216.6875, + "completions/mean_terminated_length": 216.6875, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.12315688978466108, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.03555730218067765, + "learning_rate": 7.5359999999999995e-06, + "loss": 0.0271, + "num_tokens": 54005508.0, + "reward": 3.8622612953186035, + "reward_std": 0.37032949924468994, + "rewards/reward_fn/mean": 3.8622612953186035, + "rewards/reward_fn/std": 0.37032952904701233, + "step": 1161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1203.0, + "completions/max_terminated_length": 1203.0, + "completions/mean_length": 353.6875, + "completions/mean_terminated_length": 353.6875, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.12326296807043598, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.018009544699452817, + "learning_rate": 7.5355999999999995e-06, + "loss": -0.0023, + "num_tokens": 54058490.0, + "reward": 3.888385772705078, + "reward_std": 0.3552592396736145, + "rewards/reward_fn/mean": 3.888385772705078, + "rewards/reward_fn/std": 0.3552592098712921, + "step": 1162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.0, + "completions/max_terminated_length": 614.0, + "completions/mean_length": 222.125, + "completions/mean_terminated_length": 222.125, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.12336904635621089, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09716796875, + "kl": 0.02037247340194881, + "learning_rate": 7.5351999999999994e-06, + "loss": 0.0008, + "num_tokens": 54102334.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 890.0, + "completions/max_terminated_length": 890.0, + "completions/mean_length": 307.5625, + "completions/mean_terminated_length": 307.5625, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.12347512464198579, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.026630163891240954, + "learning_rate": 7.534799999999999e-06, + "loss": 0.0552, + "num_tokens": 54132944.0, + "reward": 3.7399346828460693, + "reward_std": 0.5008683800697327, + "rewards/reward_fn/mean": 3.7399346828460693, + "rewards/reward_fn/std": 0.5008684396743774, + "step": 1164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1234.0, + "completions/max_terminated_length": 1234.0, + "completions/mean_length": 327.96875, + "completions/mean_terminated_length": 327.96875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.12358120292776069, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.890625, + "kl": 0.024272862123325467, + "learning_rate": 7.534399999999999e-06, + "loss": 0.2175, + "num_tokens": 54182607.0, + "reward": 3.42256236076355, + "reward_std": 1.0162280797958374, + "rewards/reward_fn/mean": 3.42256236076355, + "rewards/reward_fn/std": 1.0162280797958374, + "step": 1165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.0, + "completions/max_terminated_length": 594.0, + "completions/mean_length": 255.90625, + "completions/mean_terminated_length": 255.90625, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.12368728121353559, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "kl": 0.02412836579605937, + "learning_rate": 7.533999999999999e-06, + "loss": 0.1033, + "num_tokens": 54237196.0, + "reward": 3.846345901489258, + "reward_std": 0.4130762219429016, + "rewards/reward_fn/mean": 3.846345901489258, + "rewards/reward_fn/std": 0.4130762219429016, + "step": 1166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1370.0, + "completions/max_terminated_length": 1370.0, + "completions/mean_length": 366.46875, + "completions/mean_terminated_length": 366.46875, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.12379335949931049, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.024113824125379324, + "learning_rate": 7.533599999999999e-06, + "loss": 0.003, + "num_tokens": 54280251.0, + "reward": 3.4150004386901855, + "reward_std": 0.6626617312431335, + "rewards/reward_fn/mean": 3.4150004386901855, + "rewards/reward_fn/std": 0.6626616716384888, + "step": 1167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 200.25, + "completions/mean_terminated_length": 200.25, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.12389943778508539, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.421875, + "kl": 0.025690929731354117, + "learning_rate": 7.533199999999999e-06, + "loss": 0.0769, + "num_tokens": 54321059.0, + "reward": 2.788593053817749, + "reward_std": 0.048831358551979065, + "rewards/reward_fn/mean": 2.788593053817749, + "rewards/reward_fn/std": 0.048831358551979065, + "step": 1168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1791.0, + "completions/max_terminated_length": 1791.0, + "completions/mean_length": 496.5625, + "completions/mean_terminated_length": 496.5625, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.1240055160708603, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.017352971248328686, + "learning_rate": 7.532799999999999e-06, + "loss": -0.0641, + "num_tokens": 54375925.0, + "reward": 3.188598871231079, + "reward_std": 0.6397762894630432, + "rewards/reward_fn/mean": 3.188598871231079, + "rewards/reward_fn/std": 0.6397762298583984, + "step": 1169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 142.9375, + "completions/mean_terminated_length": 142.9375, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.1241115943566352, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2451171875, + "kl": 0.06317757535725832, + "learning_rate": 7.532399999999999e-06, + "loss": 0.0025, + "num_tokens": 54416211.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 146.0625, + "completions/mean_terminated_length": 146.0625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.1242176726424101, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.138671875, + "kl": 0.02488727425225079, + "learning_rate": 7.532e-06, + "loss": 0.001, + "num_tokens": 54455445.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 634.0, + "completions/max_terminated_length": 634.0, + "completions/mean_length": 187.5625, + "completions/mean_terminated_length": 187.5625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.124323750928185, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.026105482364073396, + "learning_rate": 7.5316e-06, + "loss": 0.0265, + "num_tokens": 54492679.0, + "reward": 3.9259839057922363, + "reward_std": 0.291323721408844, + "rewards/reward_fn/mean": 3.9259839057922363, + "rewards/reward_fn/std": 0.291323721408844, + "step": 1172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 231.875, + "completions/mean_terminated_length": 231.875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.1244298292139599, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.015331451082602143, + "learning_rate": 7.5312e-06, + "loss": 0.0222, + "num_tokens": 54553795.0, + "reward": 3.9769577980041504, + "reward_std": 0.13034707307815552, + "rewards/reward_fn/mean": 3.9769577980041504, + "rewards/reward_fn/std": 0.1303471028804779, + "step": 1173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 100.8125, + "completions/mean_terminated_length": 100.8125, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.12453590749973481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.193359375, + "kl": 0.03184149111621082, + "learning_rate": 7.5308e-06, + "loss": 0.0013, + "num_tokens": 54591645.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1491.0, + "completions/max_terminated_length": 1491.0, + "completions/mean_length": 347.1875, + "completions/mean_terminated_length": 347.1875, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.12464198578550971, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9296875, + "kl": 0.027322919107973576, + "learning_rate": 7.5304e-06, + "loss": -0.2403, + "num_tokens": 54634275.0, + "reward": 2.2521934509277344, + "reward_std": 0.5425410866737366, + "rewards/reward_fn/mean": 2.2521934509277344, + "rewards/reward_fn/std": 0.5425410866737366, + "step": 1175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 905.0, + "completions/max_terminated_length": 905.0, + "completions/mean_length": 264.90625, + "completions/mean_terminated_length": 264.90625, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.12474806407128461, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.01803676167037338, + "learning_rate": 7.53e-06, + "loss": -0.0299, + "num_tokens": 54664704.0, + "reward": 3.8208837509155273, + "reward_std": 0.4239185154438019, + "rewards/reward_fn/mean": 3.8208837509155273, + "rewards/reward_fn/std": 0.4239185154438019, + "step": 1176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1419.0, + "completions/max_terminated_length": 1419.0, + "completions/mean_length": 380.90625, + "completions/mean_terminated_length": 380.90625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.12485414235705951, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.02894961880519986, + "learning_rate": 7.5296e-06, + "loss": -0.0024, + "num_tokens": 54708189.0, + "reward": 3.098309278488159, + "reward_std": 0.35013359785079956, + "rewards/reward_fn/mean": 3.098309278488159, + "rewards/reward_fn/std": 0.35013359785079956, + "step": 1177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 196.8125, + "completions/mean_terminated_length": 196.8125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.12496022064283441, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08251953125, + "kl": 0.020095350686460733, + "learning_rate": 7.5292e-06, + "loss": 0.0008, + "num_tokens": 54760727.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 180.09375, + "completions/mean_terminated_length": 180.09375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.12506629892860932, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.890625, + "kl": 0.028536976547911763, + "learning_rate": 7.5288e-06, + "loss": 0.1951, + "num_tokens": 54812314.0, + "reward": 3.8505232334136963, + "reward_std": 0.40203312039375305, + "rewards/reward_fn/mean": 3.8505232334136963, + "rewards/reward_fn/std": 0.40203315019607544, + "step": 1179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 572.0, + "completions/max_terminated_length": 572.0, + "completions/mean_length": 140.3125, + "completions/mean_terminated_length": 140.3125, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.1251723772143842, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.140625, + "kl": 0.028160194400697947, + "learning_rate": 7.5284e-06, + "loss": 0.0011, + "num_tokens": 54844484.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 165.625, + "completions/mean_terminated_length": 165.625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.12527845550015912, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10009765625, + "kl": 0.018880587071180344, + "learning_rate": 7.527999999999999e-06, + "loss": 0.0008, + "num_tokens": 54892440.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 569.0, + "completions/max_terminated_length": 569.0, + "completions/mean_length": 108.75, + "completions/mean_terminated_length": 108.75, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.12538453378593403, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1259765625, + "kl": 0.030261845095083117, + "learning_rate": 7.527599999999999e-06, + "loss": 0.0012, + "num_tokens": 54919440.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 647.0, + "completions/max_terminated_length": 647.0, + "completions/mean_length": 353.9375, + "completions/mean_terminated_length": 353.9375, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.12549061207170892, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.019826365518383682, + "learning_rate": 7.5272e-06, + "loss": 0.0081, + "num_tokens": 54949006.0, + "reward": 3.937495231628418, + "reward_std": 0.24625274538993835, + "rewards/reward_fn/mean": 3.937495231628418, + "rewards/reward_fn/std": 0.24625271558761597, + "step": 1183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 138.625, + "completions/mean_terminated_length": 138.625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.12559669035748383, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.02072442672215402, + "learning_rate": 7.5268e-06, + "loss": 0.0357, + "num_tokens": 54983426.0, + "reward": 2.824800968170166, + "reward_std": 0.023115260526537895, + "rewards/reward_fn/mean": 2.824800968170166, + "rewards/reward_fn/std": 0.023115256801247597, + "step": 1184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1097.0, + "completions/max_terminated_length": 1097.0, + "completions/mean_length": 375.9375, + "completions/mean_terminated_length": 375.9375, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.12570276864325872, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.01736769184935838, + "learning_rate": 7.5264e-06, + "loss": -0.0127, + "num_tokens": 55026240.0, + "reward": 3.8968665599823, + "reward_std": 0.33643701672554016, + "rewards/reward_fn/mean": 3.8968665599823, + "rewards/reward_fn/std": 0.3364369869232178, + "step": 1185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1198.0, + "completions/max_terminated_length": 1198.0, + "completions/mean_length": 286.40625, + "completions/mean_terminated_length": 286.40625, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.12580884692903363, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.021791806211695075, + "learning_rate": 7.526e-06, + "loss": -0.0103, + "num_tokens": 55069293.0, + "reward": 3.151297092437744, + "reward_std": 0.42361894249916077, + "rewards/reward_fn/mean": 3.151297092437744, + "rewards/reward_fn/std": 0.4236189126968384, + "step": 1186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 231.46875, + "completions/mean_terminated_length": 231.46875, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.12591492521480852, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9765625, + "kl": 0.022306752856820822, + "learning_rate": 7.5255999999999996e-06, + "loss": 0.0999, + "num_tokens": 55109244.0, + "reward": 2.9199135303497314, + "reward_std": 0.038811203092336655, + "rewards/reward_fn/mean": 2.9199135303497314, + "rewards/reward_fn/std": 0.03881121799349785, + "step": 1187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1251.0, + "completions/max_terminated_length": 1251.0, + "completions/mean_length": 444.8125, + "completions/mean_terminated_length": 444.8125, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.12602100350058343, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.016548768035136163, + "learning_rate": 7.5251999999999995e-06, + "loss": -0.0012, + "num_tokens": 55156630.0, + "reward": 2.838229179382324, + "reward_std": 0.38113701343536377, + "rewards/reward_fn/mean": 2.838229179382324, + "rewards/reward_fn/std": 0.381136953830719, + "step": 1188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1006.0, + "completions/max_terminated_length": 1006.0, + "completions/mean_length": 224.09375, + "completions/mean_terminated_length": 224.09375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.12612708178635834, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09326171875, + "kl": 0.022972247563302517, + "learning_rate": 7.5247999999999995e-06, + "loss": 0.0009, + "num_tokens": 55201497.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 227.125, + "completions/mean_terminated_length": 227.125, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.12623316007213323, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0712890625, + "kl": 0.016570686595514417, + "learning_rate": 7.5243999999999995e-06, + "loss": 0.0007, + "num_tokens": 55253885.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.0, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 120.625, + "completions/mean_terminated_length": 120.625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.12633923835790814, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0830078125, + "kl": 0.01669433683855459, + "learning_rate": 7.5239999999999995e-06, + "loss": 0.0007, + "num_tokens": 55299857.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 961.0, + "completions/max_terminated_length": 961.0, + "completions/mean_length": 317.1875, + "completions/mean_terminated_length": 317.1875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.12644531664368303, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.017451874213293195, + "learning_rate": 7.523599999999999e-06, + "loss": 0.042, + "num_tokens": 55343415.0, + "reward": 3.317841053009033, + "reward_std": 0.36798393726348877, + "rewards/reward_fn/mean": 3.317841053009033, + "rewards/reward_fn/std": 0.3679839074611664, + "step": 1192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 209.03125, + "completions/mean_terminated_length": 209.03125, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.12655139492945794, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10546875, + "kl": 0.02464919281192124, + "learning_rate": 7.523199999999999e-06, + "loss": 0.001, + "num_tokens": 55384440.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 958.0, + "completions/max_terminated_length": 958.0, + "completions/mean_length": 233.65625, + "completions/mean_terminated_length": 233.65625, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.12665747321523285, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.018277527298778296, + "learning_rate": 7.5228e-06, + "loss": 0.1391, + "num_tokens": 55425453.0, + "reward": 2.884368419647217, + "reward_std": 0.03824806585907936, + "rewards/reward_fn/mean": 2.884368419647217, + "rewards/reward_fn/std": 0.038248054683208466, + "step": 1194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1284.0, + "completions/max_terminated_length": 1284.0, + "completions/mean_length": 469.59375, + "completions/mean_terminated_length": 469.59375, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.12676355150100774, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.022253156173974276, + "learning_rate": 7.5224e-06, + "loss": 0.1067, + "num_tokens": 55476704.0, + "reward": 2.19340443611145, + "reward_std": 0.5337251424789429, + "rewards/reward_fn/mean": 2.19340443611145, + "rewards/reward_fn/std": 0.5337251424789429, + "step": 1195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 932.0, + "completions/max_terminated_length": 932.0, + "completions/mean_length": 283.71875, + "completions/mean_terminated_length": 283.71875, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.12686962978678265, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.02405042969621718, + "learning_rate": 7.522e-06, + "loss": 0.0576, + "num_tokens": 55530359.0, + "reward": 3.064527988433838, + "reward_std": 0.7311015725135803, + "rewards/reward_fn/mean": 3.064527988433838, + "rewards/reward_fn/std": 0.7311015725135803, + "step": 1196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1483.0, + "completions/max_terminated_length": 1483.0, + "completions/mean_length": 494.71875, + "completions/mean_terminated_length": 494.71875, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.12697570807255754, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.03502373583614826, + "learning_rate": 7.5216e-06, + "loss": -0.0091, + "num_tokens": 55589870.0, + "reward": 2.738553524017334, + "reward_std": 0.32435670495033264, + "rewards/reward_fn/mean": 2.738553524017334, + "rewards/reward_fn/std": 0.32435664534568787, + "step": 1197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 977.0, + "completions/max_terminated_length": 977.0, + "completions/mean_length": 295.40625, + "completions/mean_terminated_length": 295.40625, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.12708178635833245, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0791015625, + "kl": 0.021748598664999008, + "learning_rate": 7.5212e-06, + "loss": 0.0009, + "num_tokens": 55637723.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 188.15625, + "completions/mean_terminated_length": 188.15625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.12718786464410736, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.421875, + "kl": 0.032654849579557776, + "learning_rate": 7.5208e-06, + "loss": 0.0252, + "num_tokens": 55684416.0, + "reward": 3.786581039428711, + "reward_std": 0.40999314188957214, + "rewards/reward_fn/mean": 3.786581039428711, + "rewards/reward_fn/std": 0.40999317169189453, + "step": 1199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1946.0, + "completions/mean_length": 433.5, + "completions/mean_terminated_length": 381.4193420410156, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.12729394292988225, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.02767302538268268, + "learning_rate": 7.5204e-06, + "loss": 0.1638, + "num_tokens": 55738768.0, + "reward": 2.4253549575805664, + "reward_std": 0.6796978712081909, + "rewards/reward_fn/mean": 2.4253549575805664, + "rewards/reward_fn/std": 0.6796978712081909, + "step": 1200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 178.625, + "completions/mean_terminated_length": 178.625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.12740002121565716, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.023716250201687217, + "learning_rate": 7.519999999999999e-06, + "loss": 0.03, + "num_tokens": 55773892.0, + "reward": 3.820127487182617, + "reward_std": 0.5937113165855408, + "rewards/reward_fn/mean": 3.820127487182617, + "rewards/reward_fn/std": 0.5937113761901855, + "step": 1201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 196.0, + "completions/mean_terminated_length": 196.0, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.12750609950143205, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.59375, + "kl": 0.026854999363422394, + "learning_rate": 7.519599999999999e-06, + "loss": 0.0397, + "num_tokens": 55812132.0, + "reward": 2.892002582550049, + "reward_std": 0.21918649971485138, + "rewards/reward_fn/mean": 2.892002582550049, + "rewards/reward_fn/std": 0.21918649971485138, + "step": 1202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 115.59375, + "completions/mean_terminated_length": 115.59375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.12761217778720696, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.34375, + "kl": 0.028977030655369163, + "learning_rate": 7.519199999999999e-06, + "loss": 0.0267, + "num_tokens": 55845655.0, + "reward": 3.7437262535095215, + "reward_std": 0.24018844962120056, + "rewards/reward_fn/mean": 3.7437262535095215, + "rewards/reward_fn/std": 0.24018843472003937, + "step": 1203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1056.0, + "completions/max_terminated_length": 1056.0, + "completions/mean_length": 576.625, + "completions/mean_terminated_length": 576.625, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "epoch": 0.12771825607298187, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.021841128589585423, + "learning_rate": 7.518799999999999e-06, + "loss": -0.0479, + "num_tokens": 55903275.0, + "reward": 2.7085070610046387, + "reward_std": 0.27952468395233154, + "rewards/reward_fn/mean": 2.7085070610046387, + "rewards/reward_fn/std": 0.2795247435569763, + "step": 1204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 250.96875, + "completions/mean_terminated_length": 250.96875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.12782433435875676, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.02591366902925074, + "learning_rate": 7.518399999999999e-06, + "loss": -0.0142, + "num_tokens": 55954634.0, + "reward": 3.8671493530273438, + "reward_std": 0.4651517868041992, + "rewards/reward_fn/mean": 3.8671493530273438, + "rewards/reward_fn/std": 0.46515172719955444, + "step": 1205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 998.0, + "completions/max_terminated_length": 998.0, + "completions/mean_length": 159.65625, + "completions/mean_terminated_length": 159.65625, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.12793041264453167, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "kl": 0.031389163341373205, + "learning_rate": 7.518e-06, + "loss": -0.0059, + "num_tokens": 55992159.0, + "reward": 3.941786527633667, + "reward_std": 0.22907233238220215, + "rewards/reward_fn/mean": 3.941786527633667, + "rewards/reward_fn/std": 0.22907236218452454, + "step": 1206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1829.0, + "completions/max_terminated_length": 1829.0, + "completions/mean_length": 441.4375, + "completions/mean_terminated_length": 441.4375, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.12803649093030656, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.021549456054344773, + "learning_rate": 7.5176e-06, + "loss": -0.0619, + "num_tokens": 56038477.0, + "reward": 1.9434212446212769, + "reward_std": 0.4184891879558563, + "rewards/reward_fn/mean": 1.9434212446212769, + "rewards/reward_fn/std": 0.41848915815353394, + "step": 1207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 252.65625, + "completions/mean_terminated_length": 252.65625, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.12814256921608147, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09521484375, + "kl": 0.022802669554948807, + "learning_rate": 7.5172e-06, + "loss": 0.0009, + "num_tokens": 56080226.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 135.28125, + "completions/mean_terminated_length": 135.28125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.12824864750185638, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.46875, + "kl": 0.029397084843367338, + "learning_rate": 7.5168e-06, + "loss": 0.0055, + "num_tokens": 56099915.0, + "reward": 3.7214303016662598, + "reward_std": 0.748846709728241, + "rewards/reward_fn/mean": 3.7214303016662598, + "rewards/reward_fn/std": 0.748846709728241, + "step": 1209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 843.0, + "completions/max_terminated_length": 843.0, + "completions/mean_length": 223.21875, + "completions/mean_terminated_length": 223.21875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.12835472578763127, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.083984375, + "kl": 0.019289852352812886, + "learning_rate": 7.5164e-06, + "loss": 0.0008, + "num_tokens": 56141554.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1602.0, + "completions/max_terminated_length": 1602.0, + "completions/mean_length": 383.5, + "completions/mean_terminated_length": 383.5, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.12846080407340618, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.021108913468196988, + "learning_rate": 7.516e-06, + "loss": 0.1666, + "num_tokens": 56186946.0, + "reward": 2.6080212593078613, + "reward_std": 0.37626245617866516, + "rewards/reward_fn/mean": 2.6080212593078613, + "rewards/reward_fn/std": 0.37626245617866516, + "step": 1211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1582.0, + "completions/max_terminated_length": 1582.0, + "completions/mean_length": 310.46875, + "completions/mean_terminated_length": 310.46875, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.12856688235918107, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.031106388196349144, + "learning_rate": 7.5156e-06, + "loss": 0.0356, + "num_tokens": 56225809.0, + "reward": 3.812455654144287, + "reward_std": 0.5331630110740662, + "rewards/reward_fn/mean": 3.812455654144287, + "rewards/reward_fn/std": 0.5331630110740662, + "step": 1212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1278.0, + "completions/max_terminated_length": 1278.0, + "completions/mean_length": 234.3125, + "completions/mean_terminated_length": 234.3125, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.12867296064495598, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08837890625, + "kl": 0.02681015362031758, + "learning_rate": 7.5152e-06, + "loss": 0.0011, + "num_tokens": 56271995.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 733.0, + "completions/max_terminated_length": 733.0, + "completions/mean_length": 182.875, + "completions/mean_terminated_length": 182.875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.12877903893073087, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.02213688869960606, + "learning_rate": 7.5148e-06, + "loss": -0.0091, + "num_tokens": 56311767.0, + "reward": 3.972648859024048, + "reward_std": 0.15472157299518585, + "rewards/reward_fn/mean": 3.972648859024048, + "rewards/reward_fn/std": 0.15472158789634705, + "step": 1214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 173.0, + "completions/mean_terminated_length": 173.0, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.12888511721650578, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.103515625, + "kl": 0.021893349941819906, + "learning_rate": 7.5144e-06, + "loss": 0.0009, + "num_tokens": 56351543.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 886.0, + "completions/max_terminated_length": 886.0, + "completions/mean_length": 373.15625, + "completions/mean_terminated_length": 373.15625, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.1289911955022807, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.018810822628438473, + "learning_rate": 7.5139999999999995e-06, + "loss": 0.023, + "num_tokens": 56409276.0, + "reward": 2.859402656555176, + "reward_std": 0.30087754130363464, + "rewards/reward_fn/mean": 2.859402656555176, + "rewards/reward_fn/std": 0.30087754130363464, + "step": 1216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1885.0, + "completions/mean_length": 824.4375, + "completions/mean_terminated_length": 784.9677124023438, + "completions/min_length": 437.0, + "completions/min_terminated_length": 437.0, + "epoch": 0.12909727378805558, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.109375, + "kl": 0.019085350446403027, + "learning_rate": 7.5135999999999995e-06, + "loss": 0.0753, + "num_tokens": 56473194.0, + "reward": 2.612720012664795, + "reward_std": 0.8761619925498962, + "rewards/reward_fn/mean": 2.612720012664795, + "rewards/reward_fn/std": 0.8761619329452515, + "step": 1217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1181.0, + "completions/max_terminated_length": 1181.0, + "completions/mean_length": 309.84375, + "completions/mean_terminated_length": 309.84375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.1292033520738305, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.025362204294651747, + "learning_rate": 7.5132e-06, + "loss": 0.1876, + "num_tokens": 56515589.0, + "reward": 3.943398952484131, + "reward_std": 0.22278046607971191, + "rewards/reward_fn/mean": 3.943398952484131, + "rewards/reward_fn/std": 0.22278045117855072, + "step": 1218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1469.0, + "completions/max_terminated_length": 1469.0, + "completions/mean_length": 458.5625, + "completions/mean_terminated_length": 458.5625, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.12930943035960538, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.019134088302962482, + "learning_rate": 7.5128e-06, + "loss": 0.0589, + "num_tokens": 56561207.0, + "reward": 2.810291051864624, + "reward_std": 0.20179764926433563, + "rewards/reward_fn/mean": 2.810291051864624, + "rewards/reward_fn/std": 0.20179766416549683, + "step": 1219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 173.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 123.375, + "completions/mean_terminated_length": 123.375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.1294155086453803, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "kl": 0.031952549470588565, + "learning_rate": 7.5124e-06, + "loss": 0.0337, + "num_tokens": 56600451.0, + "reward": 2.794665813446045, + "reward_std": 0.021768808364868164, + "rewards/reward_fn/mean": 2.794665813446045, + "rewards/reward_fn/std": 0.02176877297461033, + "step": 1220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 154.96875, + "completions/mean_terminated_length": 154.96875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.1295215869311552, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10498046875, + "kl": 0.02546194172464311, + "learning_rate": 7.511999999999999e-06, + "loss": 0.001, + "num_tokens": 56625634.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 849.0, + "completions/max_terminated_length": 849.0, + "completions/mean_length": 224.4375, + "completions/mean_terminated_length": 224.4375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.1296276652169301, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1015625, + "kl": 0.02249453659169376, + "learning_rate": 7.511599999999999e-06, + "loss": 0.0009, + "num_tokens": 56664080.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 250.90625, + "completions/mean_terminated_length": 250.90625, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.129733743502705, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.0323591826017946, + "learning_rate": 7.511199999999999e-06, + "loss": -0.0297, + "num_tokens": 56708525.0, + "reward": 2.691908359527588, + "reward_std": 0.03684841841459274, + "rewards/reward_fn/mean": 2.691908359527588, + "rewards/reward_fn/std": 0.03684840723872185, + "step": 1223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1036.0, + "completions/max_terminated_length": 1036.0, + "completions/mean_length": 304.40625, + "completions/mean_terminated_length": 304.40625, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.1298398217884799, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.02477300027385354, + "learning_rate": 7.510799999999999e-06, + "loss": 0.0578, + "num_tokens": 56751418.0, + "reward": 3.2451024055480957, + "reward_std": 0.5951552391052246, + "rewards/reward_fn/mean": 3.2451024055480957, + "rewards/reward_fn/std": 0.5951551795005798, + "step": 1224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 221.0, + "completions/mean_terminated_length": 221.0, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.1299459000742548, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.02321016346104443, + "learning_rate": 7.510399999999999e-06, + "loss": 0.0499, + "num_tokens": 56774074.0, + "reward": 3.972011089324951, + "reward_std": 0.1583283692598343, + "rewards/reward_fn/mean": 3.972011089324951, + "rewards/reward_fn/std": 0.1583283543586731, + "step": 1225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 990.0, + "completions/max_terminated_length": 990.0, + "completions/mean_length": 232.59375, + "completions/mean_terminated_length": 232.59375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.1300519783600297, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.890625, + "kl": 0.027153413044288754, + "learning_rate": 7.509999999999999e-06, + "loss": 0.0753, + "num_tokens": 56813293.0, + "reward": 2.9659242630004883, + "reward_std": 0.0341903492808342, + "rewards/reward_fn/mean": 2.9659242630004883, + "rewards/reward_fn/std": 0.0341903492808342, + "step": 1226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1277.0, + "completions/max_terminated_length": 1277.0, + "completions/mean_length": 396.53125, + "completions/mean_terminated_length": 396.53125, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.1301580566458046, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.022763823391869664, + "learning_rate": 7.509599999999999e-06, + "loss": 0.0602, + "num_tokens": 56865662.0, + "reward": 3.668447494506836, + "reward_std": 0.6870428919792175, + "rewards/reward_fn/mean": 3.668447494506836, + "rewards/reward_fn/std": 0.6870428323745728, + "step": 1227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 770.0, + "completions/max_terminated_length": 770.0, + "completions/mean_length": 259.125, + "completions/mean_terminated_length": 259.125, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.1302641349315795, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.02840983378700912, + "learning_rate": 7.509199999999999e-06, + "loss": 0.001, + "num_tokens": 56907266.0, + "reward": 3.0267953872680664, + "reward_std": 0.18145422637462616, + "rewards/reward_fn/mean": 3.0267953872680664, + "rewards/reward_fn/std": 0.18145416676998138, + "step": 1228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1084.0, + "completions/max_terminated_length": 1084.0, + "completions/mean_length": 509.75, + "completions/mean_terminated_length": 509.75, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.1303702132173544, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.98046875, + "kl": 0.021488810423761606, + "learning_rate": 7.508799999999999e-06, + "loss": -0.0026, + "num_tokens": 56957146.0, + "reward": 2.784040927886963, + "reward_std": 0.219995379447937, + "rewards/reward_fn/mean": 2.784040927886963, + "rewards/reward_fn/std": 0.21999536454677582, + "step": 1229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1945.0, + "completions/max_terminated_length": 1945.0, + "completions/mean_length": 609.0, + "completions/mean_terminated_length": 609.0, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.1304762915031293, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.022416489897295833, + "learning_rate": 7.5084e-06, + "loss": 0.1618, + "num_tokens": 57012602.0, + "reward": 3.59625244140625, + "reward_std": 0.8699040412902832, + "rewards/reward_fn/mean": 3.59625244140625, + "rewards/reward_fn/std": 0.8699040412902832, + "step": 1230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 319.96875, + "completions/mean_terminated_length": 319.96875, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.13058236978890422, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.025042463559657335, + "learning_rate": 7.508e-06, + "loss": 0.003, + "num_tokens": 57064185.0, + "reward": 3.126904010772705, + "reward_std": 0.5139070749282837, + "rewards/reward_fn/mean": 3.126904010772705, + "rewards/reward_fn/std": 0.5139070749282837, + "step": 1231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 774.0, + "completions/max_terminated_length": 774.0, + "completions/mean_length": 170.34375, + "completions/mean_terminated_length": 170.34375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.1306884480746791, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.024505676236003637, + "learning_rate": 7.5076e-06, + "loss": 0.07, + "num_tokens": 57102692.0, + "reward": 3.0724833011627197, + "reward_std": 0.03876578435301781, + "rewards/reward_fn/mean": 3.0724833011627197, + "rewards/reward_fn/std": 0.03876576945185661, + "step": 1232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 105.03125, + "completions/mean_terminated_length": 105.03125, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.13079452636045402, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.126953125, + "kl": 0.01858626154717058, + "learning_rate": 7.5072e-06, + "loss": 0.0007, + "num_tokens": 57171781.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 90.375, + "completions/mean_terminated_length": 90.375, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.1309006046462289, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1396484375, + "kl": 0.027592405676841736, + "learning_rate": 7.5068e-06, + "loss": 0.0011, + "num_tokens": 57202545.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1151.0, + "completions/max_terminated_length": 1151.0, + "completions/mean_length": 732.09375, + "completions/mean_terminated_length": 732.09375, + "completions/min_length": 424.0, + "completions/min_terminated_length": 424.0, + "epoch": 0.13100668293200382, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.984375, + "kl": 0.019199739210307598, + "learning_rate": 7.5064e-06, + "loss": 0.0475, + "num_tokens": 57265332.0, + "reward": 2.7055556774139404, + "reward_std": 0.27973470091819763, + "rewards/reward_fn/mean": 2.7055556774139404, + "rewards/reward_fn/std": 0.27973467111587524, + "step": 1235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 364.5, + "completions/mean_terminated_length": 364.5, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.13111276121777873, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.028548541711643338, + "learning_rate": 7.506e-06, + "loss": -0.02, + "num_tokens": 57320836.0, + "reward": 2.947634696960449, + "reward_std": 0.29028210043907166, + "rewards/reward_fn/mean": 2.947634696960449, + "rewards/reward_fn/std": 0.29028213024139404, + "step": 1236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 697.0, + "completions/max_terminated_length": 697.0, + "completions/mean_length": 192.84375, + "completions/mean_terminated_length": 192.84375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.13121883950355362, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1279296875, + "kl": 0.029802830889821053, + "learning_rate": 7.5056e-06, + "loss": 0.0012, + "num_tokens": 57362975.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1456.0, + "completions/max_terminated_length": 1456.0, + "completions/mean_length": 424.59375, + "completions/mean_terminated_length": 424.59375, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.13132491778932853, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.021809349535033107, + "learning_rate": 7.5052e-06, + "loss": -0.0274, + "num_tokens": 57420690.0, + "reward": 3.653506278991699, + "reward_std": 0.635819137096405, + "rewards/reward_fn/mean": 3.653506278991699, + "rewards/reward_fn/std": 0.6358190774917603, + "step": 1238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 248.40625, + "completions/mean_terminated_length": 248.40625, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.13143099607510342, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.026568999979645014, + "learning_rate": 7.5048e-06, + "loss": 0.054, + "num_tokens": 57458239.0, + "reward": 2.9668123722076416, + "reward_std": 0.20671309530735016, + "rewards/reward_fn/mean": 2.9668123722076416, + "rewards/reward_fn/std": 0.20671308040618896, + "step": 1239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 980.0, + "completions/max_terminated_length": 980.0, + "completions/mean_length": 183.4375, + "completions/mean_terminated_length": 183.4375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.13153707436087833, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.02611563727259636, + "learning_rate": 7.5044e-06, + "loss": 0.1308, + "num_tokens": 57515629.0, + "reward": 3.3227896690368652, + "reward_std": 0.5340291857719421, + "rewards/reward_fn/mean": 3.3227896690368652, + "rewards/reward_fn/std": 0.5340291857719421, + "step": 1240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 578.0, + "completions/max_terminated_length": 578.0, + "completions/mean_length": 172.84375, + "completions/mean_terminated_length": 172.84375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.13164315264665322, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.375, + "kl": 0.02926379698328674, + "learning_rate": 7.503999999999999e-06, + "loss": 0.0133, + "num_tokens": 57562088.0, + "reward": 3.874748706817627, + "reward_std": 0.452591210603714, + "rewards/reward_fn/mean": 3.874748706817627, + "rewards/reward_fn/std": 0.452591210603714, + "step": 1241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 901.0, + "completions/max_terminated_length": 901.0, + "completions/mean_length": 212.15625, + "completions/mean_terminated_length": 212.15625, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.13174923093242813, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6875, + "kl": 0.03235385986045003, + "learning_rate": 7.5036e-06, + "loss": 0.0707, + "num_tokens": 57610029.0, + "reward": 3.885643720626831, + "reward_std": 0.3627666234970093, + "rewards/reward_fn/mean": 3.885643720626831, + "rewards/reward_fn/std": 0.3627666234970093, + "step": 1242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1720.0, + "completions/mean_length": 711.53125, + "completions/mean_terminated_length": 668.4193115234375, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.13185530921820304, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.019830133765935898, + "learning_rate": 7.5032e-06, + "loss": 0.2383, + "num_tokens": 57672158.0, + "reward": 2.4883432388305664, + "reward_std": 0.572623610496521, + "rewards/reward_fn/mean": 2.4883432388305664, + "rewards/reward_fn/std": 0.5726235508918762, + "step": 1243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1099.0, + "completions/max_terminated_length": 1099.0, + "completions/mean_length": 352.71875, + "completions/mean_terminated_length": 352.71875, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.13196138750397793, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.02547502121888101, + "learning_rate": 7.5027999999999996e-06, + "loss": 0.0176, + "num_tokens": 57717909.0, + "reward": 2.8250460624694824, + "reward_std": 0.2939043641090393, + "rewards/reward_fn/mean": 2.8250460624694824, + "rewards/reward_fn/std": 0.2939044237136841, + "step": 1244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 691.0, + "completions/max_terminated_length": 691.0, + "completions/mean_length": 167.5625, + "completions/mean_terminated_length": 167.5625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.13206746578975284, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.25, + "kl": 0.04957319051027298, + "learning_rate": 7.5023999999999995e-06, + "loss": 0.1431, + "num_tokens": 57772551.0, + "reward": 3.9671456813812256, + "reward_std": 0.1858520656824112, + "rewards/reward_fn/mean": 3.9671456813812256, + "rewards/reward_fn/std": 0.1858520358800888, + "step": 1245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1102.0, + "completions/max_terminated_length": 1102.0, + "completions/mean_length": 408.5625, + "completions/mean_terminated_length": 408.5625, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.13217354407552773, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.028883446706458926, + "learning_rate": 7.5019999999999995e-06, + "loss": 0.0568, + "num_tokens": 57820825.0, + "reward": 2.7001430988311768, + "reward_std": 0.3382115960121155, + "rewards/reward_fn/mean": 2.7001430988311768, + "rewards/reward_fn/std": 0.33821165561676025, + "step": 1246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1814.0, + "completions/max_terminated_length": 1814.0, + "completions/mean_length": 494.78125, + "completions/mean_terminated_length": 494.78125, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.13227962236130264, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.018464128370396793, + "learning_rate": 7.5015999999999995e-06, + "loss": -0.1013, + "num_tokens": 57858418.0, + "reward": 3.8081459999084473, + "reward_std": 0.5380722284317017, + "rewards/reward_fn/mean": 3.8081459999084473, + "rewards/reward_fn/std": 0.5380722284317017, + "step": 1247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 102.0, + "completions/max_terminated_length": 102.0, + "completions/mean_length": 79.84375, + "completions/mean_terminated_length": 79.84375, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.13238570064707755, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1943359375, + "kl": 0.02420707419514656, + "learning_rate": 7.5011999999999994e-06, + "loss": 0.001, + "num_tokens": 57898733.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1284.0, + "completions/max_terminated_length": 1284.0, + "completions/mean_length": 336.15625, + "completions/mean_terminated_length": 336.15625, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.13249177893285244, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.02854512631893158, + "learning_rate": 7.500799999999999e-06, + "loss": -0.0838, + "num_tokens": 57938002.0, + "reward": 2.6425585746765137, + "reward_std": 0.25894415378570557, + "rewards/reward_fn/mean": 2.6425585746765137, + "rewards/reward_fn/std": 0.25894415378570557, + "step": 1249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 294.5, + "completions/mean_terminated_length": 294.5, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.13259785721862735, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.02842319617047906, + "learning_rate": 7.500399999999999e-06, + "loss": 0.063, + "num_tokens": 57993954.0, + "reward": 3.9350175857543945, + "reward_std": 0.2557204067707062, + "rewards/reward_fn/mean": 3.9350175857543945, + "rewards/reward_fn/std": 0.2557204067707062, + "step": 1250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 171.0, + "completions/max_terminated_length": 171.0, + "completions/mean_length": 103.03125, + "completions/mean_terminated_length": 103.03125, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.13270393550440224, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.046875, + "kl": 0.02735895407386124, + "learning_rate": 7.499999999999999e-06, + "loss": 0.0611, + "num_tokens": 58034563.0, + "reward": 3.108828544616699, + "reward_std": 0.05664081871509552, + "rewards/reward_fn/mean": 3.108828544616699, + "rewards/reward_fn/std": 0.056640833616256714, + "step": 1251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1172.0, + "completions/max_terminated_length": 1172.0, + "completions/mean_length": 227.5625, + "completions/mean_terminated_length": 227.5625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.13281001379017715, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.027604984818026423, + "learning_rate": 7.499599999999999e-06, + "loss": 0.1806, + "num_tokens": 58079445.0, + "reward": 3.8578529357910156, + "reward_std": 0.475358247756958, + "rewards/reward_fn/mean": 3.8578529357910156, + "rewards/reward_fn/std": 0.4753582775592804, + "step": 1252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 782.0, + "completions/max_terminated_length": 782.0, + "completions/mean_length": 311.09375, + "completions/mean_terminated_length": 311.09375, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.13291609207595206, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.01987571455538273, + "learning_rate": 7.4992e-06, + "loss": 0.0234, + "num_tokens": 58131000.0, + "reward": 3.4816336631774902, + "reward_std": 0.716206431388855, + "rewards/reward_fn/mean": 3.4816336631774902, + "rewards/reward_fn/std": 0.716206431388855, + "step": 1253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1183.0, + "completions/max_terminated_length": 1183.0, + "completions/mean_length": 362.34375, + "completions/mean_terminated_length": 362.34375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.13302217036172695, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.027468676446005702, + "learning_rate": 7.4988e-06, + "loss": 0.014, + "num_tokens": 58186275.0, + "reward": 3.3233084678649902, + "reward_std": 0.533751368522644, + "rewards/reward_fn/mean": 3.3233084678649902, + "rewards/reward_fn/std": 0.533751368522644, + "step": 1254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.0, + "completions/max_terminated_length": 579.0, + "completions/mean_length": 285.59375, + "completions/mean_terminated_length": 285.59375, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.13312824864750186, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "kl": 0.02612379170022905, + "learning_rate": 7.4984e-06, + "loss": -0.0235, + "num_tokens": 58236950.0, + "reward": 3.6018028259277344, + "reward_std": 0.6388559341430664, + "rewards/reward_fn/mean": 3.6018028259277344, + "rewards/reward_fn/std": 0.6388558745384216, + "step": 1255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 781.0, + "completions/max_terminated_length": 781.0, + "completions/mean_length": 199.28125, + "completions/mean_terminated_length": 199.28125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.13323432693327675, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "kl": 0.027978347148746252, + "learning_rate": 7.498e-06, + "loss": 0.0033, + "num_tokens": 58273119.0, + "reward": 2.9400501251220703, + "reward_std": 0.0498286671936512, + "rewards/reward_fn/mean": 2.9400501251220703, + "rewards/reward_fn/std": 0.04982864111661911, + "step": 1256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 166.90625, + "completions/mean_terminated_length": 166.90625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.13334040521905166, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "kl": 0.027146222535520792, + "learning_rate": 7.4976e-06, + "loss": -0.0098, + "num_tokens": 58309596.0, + "reward": 2.928208112716675, + "reward_std": 0.5002910494804382, + "rewards/reward_fn/mean": 2.928208112716675, + "rewards/reward_fn/std": 0.5002910494804382, + "step": 1257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 877.0, + "completions/max_terminated_length": 877.0, + "completions/mean_length": 172.46875, + "completions/mean_terminated_length": 172.46875, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.13344648350482657, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.026685474440455437, + "learning_rate": 7.4972e-06, + "loss": 0.1586, + "num_tokens": 58352171.0, + "reward": 3.931187629699707, + "reward_std": 0.3892618715763092, + "rewards/reward_fn/mean": 3.931187629699707, + "rewards/reward_fn/std": 0.3892618715763092, + "step": 1258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 839.0, + "completions/max_terminated_length": 839.0, + "completions/mean_length": 243.625, + "completions/mean_terminated_length": 243.625, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.13355256179060146, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8671875, + "kl": 0.0268127650488168, + "learning_rate": 7.4968e-06, + "loss": -0.1437, + "num_tokens": 58390175.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 1259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 780.0, + "completions/max_terminated_length": 780.0, + "completions/mean_length": 240.625, + "completions/mean_terminated_length": 240.625, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.13365864007637637, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "kl": 0.03086657985113561, + "learning_rate": 7.4964e-06, + "loss": 0.0395, + "num_tokens": 58439379.0, + "reward": 3.4151363372802734, + "reward_std": 0.5244680643081665, + "rewards/reward_fn/mean": 3.4151363372802734, + "rewards/reward_fn/std": 0.5244680643081665, + "step": 1260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1492.0, + "completions/max_terminated_length": 1492.0, + "completions/mean_length": 280.65625, + "completions/mean_terminated_length": 280.65625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.13376471836215126, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12890625, + "kl": 0.0308777317404747, + "learning_rate": 7.496e-06, + "loss": 0.0012, + "num_tokens": 58485320.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 236.625, + "completions/mean_terminated_length": 236.625, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.13387079664792617, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.026496544247493148, + "learning_rate": 7.495599999999999e-06, + "loss": 0.0164, + "num_tokens": 58542108.0, + "reward": 3.928636074066162, + "reward_std": 0.403695672750473, + "rewards/reward_fn/mean": 3.928636074066162, + "rewards/reward_fn/std": 0.403695672750473, + "step": 1262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 192.34375, + "completions/mean_terminated_length": 192.34375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.13397687493370108, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.02878828765824437, + "learning_rate": 7.495199999999999e-06, + "loss": 0.0331, + "num_tokens": 58591783.0, + "reward": 3.8793630599975586, + "reward_std": 0.3811015188694, + "rewards/reward_fn/mean": 3.8793630599975586, + "rewards/reward_fn/std": 0.3811015486717224, + "step": 1263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 901.0, + "completions/max_terminated_length": 901.0, + "completions/mean_length": 282.125, + "completions/mean_terminated_length": 282.125, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.13408295321947597, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1142578125, + "kl": 0.024802331812679768, + "learning_rate": 7.494799999999999e-06, + "loss": 0.001, + "num_tokens": 58633995.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 201.53125, + "completions/mean_terminated_length": 201.53125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.13418903150525088, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.375, + "kl": 0.03025827999226749, + "learning_rate": 7.4944e-06, + "loss": 0.0333, + "num_tokens": 58664124.0, + "reward": 3.811546564102173, + "reward_std": 0.534115731716156, + "rewards/reward_fn/mean": 3.811546564102173, + "rewards/reward_fn/std": 0.534115731716156, + "step": 1265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 829.0, + "completions/max_terminated_length": 829.0, + "completions/mean_length": 377.78125, + "completions/mean_terminated_length": 377.78125, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.13429510979102577, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.03353354521095753, + "learning_rate": 7.494e-06, + "loss": 0.0983, + "num_tokens": 58715669.0, + "reward": 2.4701292514801025, + "reward_std": 0.6296804547309875, + "rewards/reward_fn/mean": 2.4701292514801025, + "rewards/reward_fn/std": 0.6296803951263428, + "step": 1266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 308.3125, + "completions/mean_terminated_length": 308.3125, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.13440118807680068, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.026489258743822575, + "learning_rate": 7.4936e-06, + "loss": 0.0136, + "num_tokens": 58760319.0, + "reward": 3.2096259593963623, + "reward_std": 0.4842040240764618, + "rewards/reward_fn/mean": 3.2096259593963623, + "rewards/reward_fn/std": 0.4842039942741394, + "step": 1267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 148.53125, + "completions/mean_terminated_length": 148.53125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.13450726636257557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.140625, + "kl": 0.02966041606850922, + "learning_rate": 7.4932e-06, + "loss": 0.0012, + "num_tokens": 58790704.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 283.28125, + "completions/mean_terminated_length": 283.28125, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.13461334464835048, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.030821983935311437, + "learning_rate": 7.4928e-06, + "loss": -0.0253, + "num_tokens": 58820473.0, + "reward": 3.7869110107421875, + "reward_std": 0.49617645144462585, + "rewards/reward_fn/mean": 3.7869110107421875, + "rewards/reward_fn/std": 0.49617645144462585, + "step": 1269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 303.4375, + "completions/mean_terminated_length": 303.4375, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.1347194229341254, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.01821422518696636, + "learning_rate": 7.4924e-06, + "loss": 0.0152, + "num_tokens": 58866439.0, + "reward": 2.632956027984619, + "reward_std": 0.3445678651332855, + "rewards/reward_fn/mean": 2.632956027984619, + "rewards/reward_fn/std": 0.3445678651332855, + "step": 1270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 722.0, + "completions/max_terminated_length": 722.0, + "completions/mean_length": 402.5625, + "completions/mean_terminated_length": 402.5625, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.13482550121990028, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.02504706336185336, + "learning_rate": 7.492e-06, + "loss": -0.0248, + "num_tokens": 58941273.0, + "reward": 3.4776172637939453, + "reward_std": 0.641873836517334, + "rewards/reward_fn/mean": 3.4776172637939453, + "rewards/reward_fn/std": 0.641873836517334, + "step": 1271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 144.8125, + "completions/mean_terminated_length": 144.8125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.1349315795056752, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1328125, + "kl": 0.02622751286253333, + "learning_rate": 7.4915999999999996e-06, + "loss": 0.001, + "num_tokens": 58969779.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 700.0, + "completions/max_terminated_length": 700.0, + "completions/mean_length": 367.40625, + "completions/mean_terminated_length": 367.40625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.13503765779145008, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.024884506594389677, + "learning_rate": 7.4911999999999995e-06, + "loss": -0.0294, + "num_tokens": 59026048.0, + "reward": 2.75225830078125, + "reward_std": 0.27699360251426697, + "rewards/reward_fn/mean": 2.75225830078125, + "rewards/reward_fn/std": 0.27699360251426697, + "step": 1273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.0, + "completions/max_terminated_length": 535.0, + "completions/mean_length": 177.15625, + "completions/mean_terminated_length": 177.15625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.135143736077225, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11181640625, + "kl": 0.02919724863022566, + "learning_rate": 7.4907999999999995e-06, + "loss": 0.0012, + "num_tokens": 59062149.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 841.0, + "completions/max_terminated_length": 841.0, + "completions/mean_length": 512.53125, + "completions/mean_terminated_length": 512.53125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.1352498143629999, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.15625, + "kl": 0.017439111368730664, + "learning_rate": 7.4903999999999995e-06, + "loss": -0.0313, + "num_tokens": 59114102.0, + "reward": 2.5705909729003906, + "reward_std": 0.33944466710090637, + "rewards/reward_fn/mean": 2.5705909729003906, + "rewards/reward_fn/std": 0.33944469690322876, + "step": 1275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1666.0, + "completions/max_terminated_length": 1666.0, + "completions/mean_length": 330.25, + "completions/mean_terminated_length": 330.25, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.1353558926487748, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.025731251807883382, + "learning_rate": 7.49e-06, + "loss": 0.0415, + "num_tokens": 59176222.0, + "reward": 3.0980734825134277, + "reward_std": 0.9913315176963806, + "rewards/reward_fn/mean": 3.0980734825134277, + "rewards/reward_fn/std": 0.9913315176963806, + "step": 1276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.0, + "completions/max_terminated_length": 543.0, + "completions/mean_length": 137.84375, + "completions/mean_terminated_length": 137.84375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.1354619709345497, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1357421875, + "kl": 0.025162109872326255, + "learning_rate": 7.4896e-06, + "loss": 0.001, + "num_tokens": 59223577.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 904.0, + "completions/max_terminated_length": 904.0, + "completions/mean_length": 290.25, + "completions/mean_terminated_length": 290.25, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.1355680492203246, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8828125, + "kl": 0.024209644412621856, + "learning_rate": 7.4892e-06, + "loss": 0.0241, + "num_tokens": 59251009.0, + "reward": 2.991244077682495, + "reward_std": 0.04524612799286842, + "rewards/reward_fn/mean": 2.991244077682495, + "rewards/reward_fn/std": 0.04524614289402962, + "step": 1278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 273.5, + "completions/mean_terminated_length": 273.5, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.1356741275060995, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.023455350194126368, + "learning_rate": 7.4888e-06, + "loss": 0.0716, + "num_tokens": 59296689.0, + "reward": 3.328552722930908, + "reward_std": 0.5668790936470032, + "rewards/reward_fn/mean": 3.328552722930908, + "rewards/reward_fn/std": 0.5668790936470032, + "step": 1279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 204.96875, + "completions/mean_terminated_length": 204.96875, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.13578020579187441, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "kl": 0.024695158703252673, + "learning_rate": 7.4884e-06, + "loss": 0.0307, + "num_tokens": 59339440.0, + "reward": 3.457145929336548, + "reward_std": 0.5878257751464844, + "rewards/reward_fn/mean": 3.457145929336548, + "rewards/reward_fn/std": 0.5878257155418396, + "step": 1280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 787.0, + "completions/max_terminated_length": 787.0, + "completions/mean_length": 205.71875, + "completions/mean_terminated_length": 205.71875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.1358862840776493, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.025261733680963516, + "learning_rate": 7.488e-06, + "loss": -0.0178, + "num_tokens": 59368871.0, + "reward": 2.85054349899292, + "reward_std": 0.06259602308273315, + "rewards/reward_fn/mean": 2.85054349899292, + "rewards/reward_fn/std": 0.06259601563215256, + "step": 1281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 172.75, + "completions/mean_terminated_length": 172.75, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.1359923623634242, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.21875, + "kl": 0.02319705649279058, + "learning_rate": 7.487599999999999e-06, + "loss": 0.038, + "num_tokens": 59405055.0, + "reward": 3.5303797721862793, + "reward_std": 0.5783197283744812, + "rewards/reward_fn/mean": 3.5303797721862793, + "rewards/reward_fn/std": 0.5783197283744812, + "step": 1282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 912.0, + "completions/max_terminated_length": 912.0, + "completions/mean_length": 354.125, + "completions/mean_terminated_length": 354.125, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.1360984406491991, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.03816063771955669, + "learning_rate": 7.487199999999999e-06, + "loss": 0.0339, + "num_tokens": 59451427.0, + "reward": 2.811558723449707, + "reward_std": 1.0576351881027222, + "rewards/reward_fn/mean": 2.811558723449707, + "rewards/reward_fn/std": 1.0576351881027222, + "step": 1283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 219.96875, + "completions/mean_terminated_length": 219.96875, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.136204518934974, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5, + "kl": 0.018938904628157616, + "learning_rate": 7.486799999999999e-06, + "loss": 0.0891, + "num_tokens": 59491266.0, + "reward": 3.9295401573181152, + "reward_std": 0.3985815644264221, + "rewards/reward_fn/mean": 3.9295401573181152, + "rewards/reward_fn/std": 0.39858150482177734, + "step": 1284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 205.3125, + "completions/mean_terminated_length": 205.3125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.13631059722074892, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.90625, + "kl": 0.025979549856856465, + "learning_rate": 7.486399999999999e-06, + "loss": 0.1455, + "num_tokens": 59569580.0, + "reward": 3.9671754837036133, + "reward_std": 0.1856841892004013, + "rewards/reward_fn/mean": 3.9671754837036133, + "rewards/reward_fn/std": 0.1856841742992401, + "step": 1285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 118.34375, + "completions/mean_terminated_length": 118.34375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.1364166755065238, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.123046875, + "kl": 0.025476978393271565, + "learning_rate": 7.485999999999999e-06, + "loss": 0.001, + "num_tokens": 59606135.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1040.0, + "completions/max_terminated_length": 1040.0, + "completions/mean_length": 259.28125, + "completions/mean_terminated_length": 259.28125, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.13652275379229872, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.028418211033567786, + "learning_rate": 7.485599999999999e-06, + "loss": 0.1128, + "num_tokens": 59657664.0, + "reward": 3.1655662059783936, + "reward_std": 0.32365211844444275, + "rewards/reward_fn/mean": 3.1655662059783936, + "rewards/reward_fn/std": 0.32365208864212036, + "step": 1287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 261.03125, + "completions/mean_terminated_length": 261.03125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.1366288320780736, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.027075995225459337, + "learning_rate": 7.4852e-06, + "loss": 0.0743, + "num_tokens": 59699745.0, + "reward": 3.0787363052368164, + "reward_std": 0.5415990948677063, + "rewards/reward_fn/mean": 3.0787363052368164, + "rewards/reward_fn/std": 0.5415990948677063, + "step": 1288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 844.0, + "completions/max_terminated_length": 844.0, + "completions/mean_length": 228.78125, + "completions/mean_terminated_length": 228.78125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.13673491036384852, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.0208129589445889, + "learning_rate": 7.4848e-06, + "loss": 0.0565, + "num_tokens": 59754874.0, + "reward": 2.737701177597046, + "reward_std": 0.17712976038455963, + "rewards/reward_fn/mean": 2.737701177597046, + "rewards/reward_fn/std": 0.17712976038455963, + "step": 1289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 939.0, + "completions/max_terminated_length": 939.0, + "completions/mean_length": 359.625, + "completions/mean_terminated_length": 359.625, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.13684098864962344, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060791015625, + "kl": 0.019323077285662293, + "learning_rate": 7.4844e-06, + "loss": 0.0008, + "num_tokens": 59804078.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 891.0, + "completions/max_terminated_length": 891.0, + "completions/mean_length": 244.53125, + "completions/mean_terminated_length": 244.53125, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.13694706693539832, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.296875, + "kl": 0.02193894237279892, + "learning_rate": 7.484e-06, + "loss": -0.012, + "num_tokens": 59849343.0, + "reward": 3.662727117538452, + "reward_std": 0.8044856190681458, + "rewards/reward_fn/mean": 3.662727117538452, + "rewards/reward_fn/std": 0.804485559463501, + "step": 1291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 101.15625, + "completions/mean_terminated_length": 101.15625, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.13705314522117323, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.921875, + "kl": 0.024025865364819765, + "learning_rate": 7.4836e-06, + "loss": 0.083, + "num_tokens": 59890820.0, + "reward": 3.874630928039551, + "reward_std": 0.3960571885108948, + "rewards/reward_fn/mean": 3.874630928039551, + "rewards/reward_fn/std": 0.3960571885108948, + "step": 1292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.0, + "completions/max_terminated_length": 614.0, + "completions/mean_length": 192.78125, + "completions/mean_terminated_length": 192.78125, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.13715922350694812, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1064453125, + "kl": 0.022730932221747935, + "learning_rate": 7.4832e-06, + "loss": 0.0009, + "num_tokens": 59930685.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 175.3125, + "completions/mean_terminated_length": 175.3125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.13726530179272303, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10595703125, + "kl": 0.024798734579235315, + "learning_rate": 7.4828e-06, + "loss": 0.001, + "num_tokens": 59966855.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 208.1875, + "completions/mean_terminated_length": 208.1875, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.13737138007849792, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.022808075416833162, + "learning_rate": 7.4824e-06, + "loss": 0.0435, + "num_tokens": 60016717.0, + "reward": 1.9026248455047607, + "reward_std": 0.4392467141151428, + "rewards/reward_fn/mean": 1.9026248455047607, + "rewards/reward_fn/std": 0.43924665451049805, + "step": 1295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1085.0, + "completions/max_terminated_length": 1085.0, + "completions/mean_length": 309.0625, + "completions/mean_terminated_length": 309.0625, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.13747745836427283, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.023748031351715326, + "learning_rate": 7.482e-06, + "loss": 0.0011, + "num_tokens": 60077295.0, + "reward": 3.4045333862304688, + "reward_std": 0.6057592630386353, + "rewards/reward_fn/mean": 3.4045333862304688, + "rewards/reward_fn/std": 0.60575932264328, + "step": 1296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1140.0, + "completions/max_terminated_length": 1140.0, + "completions/mean_length": 183.625, + "completions/mean_terminated_length": 183.625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.13758353665004774, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.90625, + "kl": 0.03300858614966273, + "learning_rate": 7.4816e-06, + "loss": 0.0822, + "num_tokens": 60100131.0, + "reward": 2.8976831436157227, + "reward_std": 0.07260072231292725, + "rewards/reward_fn/mean": 2.8976831436157227, + "rewards/reward_fn/std": 0.07260074466466904, + "step": 1297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1612.0, + "completions/max_terminated_length": 1612.0, + "completions/mean_length": 478.875, + "completions/mean_terminated_length": 478.875, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.13768961493582263, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.02038547326810658, + "learning_rate": 7.4812e-06, + "loss": 0.0507, + "num_tokens": 60155679.0, + "reward": 2.7392239570617676, + "reward_std": 0.3335033655166626, + "rewards/reward_fn/mean": 2.7392239570617676, + "rewards/reward_fn/std": 0.3335033059120178, + "step": 1298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 779.0, + "completions/max_terminated_length": 779.0, + "completions/mean_length": 351.28125, + "completions/mean_terminated_length": 351.28125, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.13779569322159754, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.017921574064530432, + "learning_rate": 7.4808e-06, + "loss": -0.0526, + "num_tokens": 60211464.0, + "reward": 2.7742135524749756, + "reward_std": 0.037488196045160294, + "rewards/reward_fn/mean": 2.7742135524749756, + "rewards/reward_fn/std": 0.037488240748643875, + "step": 1299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/max_terminated_length": 566.0, + "completions/mean_length": 391.25, + "completions/mean_terminated_length": 391.25, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.13790177150737243, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.022719235508702695, + "learning_rate": 7.4804e-06, + "loss": 0.0432, + "num_tokens": 60258448.0, + "reward": 3.5938987731933594, + "reward_std": 0.6974779367446899, + "rewards/reward_fn/mean": 3.5938987731933594, + "rewards/reward_fn/std": 0.6974778771400452, + "step": 1300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 94.6875, + "completions/mean_terminated_length": 94.6875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.13800784979314734, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1533203125, + "kl": 0.027633204823359847, + "learning_rate": 7.48e-06, + "loss": 0.0011, + "num_tokens": 60320838.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 751.0, + "completions/max_terminated_length": 751.0, + "completions/mean_length": 201.4375, + "completions/mean_terminated_length": 201.4375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.13811392807892225, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1416015625, + "kl": 0.029265858931466937, + "learning_rate": 7.4795999999999995e-06, + "loss": 0.0012, + "num_tokens": 60363828.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 169.0, + "completions/max_terminated_length": 169.0, + "completions/mean_length": 116.25, + "completions/mean_terminated_length": 116.25, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.13822000636469714, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1552734375, + "kl": 0.026726818876340985, + "learning_rate": 7.4791999999999995e-06, + "loss": 0.0011, + "num_tokens": 60398460.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 111.375, + "completions/mean_terminated_length": 111.375, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.13832608465047205, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12353515625, + "kl": 0.02441283850930631, + "learning_rate": 7.4787999999999994e-06, + "loss": 0.001, + "num_tokens": 60439816.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 590.0, + "completions/max_terminated_length": 590.0, + "completions/mean_length": 100.9375, + "completions/mean_terminated_length": 100.9375, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.13843216293624694, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.515625, + "kl": 0.03212942089885473, + "learning_rate": 7.478399999999999e-06, + "loss": -0.0653, + "num_tokens": 60477830.0, + "reward": 2.844021797180176, + "reward_std": 0.21652944386005402, + "rewards/reward_fn/mean": 2.844021797180176, + "rewards/reward_fn/std": 0.21652939915657043, + "step": 1305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.0, + "completions/max_terminated_length": 703.0, + "completions/mean_length": 373.34375, + "completions/mean_terminated_length": 373.34375, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.13853824122202185, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.02917315112426877, + "learning_rate": 7.477999999999999e-06, + "loss": -0.0192, + "num_tokens": 60510897.0, + "reward": 3.6083593368530273, + "reward_std": 0.5928052067756653, + "rewards/reward_fn/mean": 3.6083593368530273, + "rewards/reward_fn/std": 0.5928052067756653, + "step": 1306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1326.0, + "completions/mean_length": 453.25, + "completions/mean_terminated_length": 401.8064270019531, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.13864431950779676, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.02317165257409215, + "learning_rate": 7.477599999999999e-06, + "loss": 0.1673, + "num_tokens": 60555865.0, + "reward": 2.9343154430389404, + "reward_std": 0.6307179927825928, + "rewards/reward_fn/mean": 2.9343154430389404, + "rewards/reward_fn/std": 0.6307179927825928, + "step": 1307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1059.0, + "completions/max_terminated_length": 1059.0, + "completions/mean_length": 324.1875, + "completions/mean_terminated_length": 324.1875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.13875039779357165, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.027127522975206375, + "learning_rate": 7.477199999999999e-06, + "loss": 0.0713, + "num_tokens": 60586719.0, + "reward": 2.7872352600097656, + "reward_std": 0.6089009642601013, + "rewards/reward_fn/mean": 2.7872352600097656, + "rewards/reward_fn/std": 0.6089009642601013, + "step": 1308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1908.0, + "completions/max_terminated_length": 1908.0, + "completions/mean_length": 667.625, + "completions/mean_terminated_length": 667.625, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "epoch": 0.13885647607934656, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.01850100071169436, + "learning_rate": 7.476799999999999e-06, + "loss": 0.0534, + "num_tokens": 60646515.0, + "reward": 2.3431787490844727, + "reward_std": 0.45147812366485596, + "rewards/reward_fn/mean": 2.3431787490844727, + "rewards/reward_fn/std": 0.45147812366485596, + "step": 1309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 706.0, + "completions/max_terminated_length": 706.0, + "completions/mean_length": 328.8125, + "completions/mean_terminated_length": 328.8125, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.13896255436512145, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.020504693733528256, + "learning_rate": 7.476399999999999e-06, + "loss": 0.0613, + "num_tokens": 60691949.0, + "reward": 1.6473358869552612, + "reward_std": 0.04363499581813812, + "rewards/reward_fn/mean": 1.6473358869552612, + "rewards/reward_fn/std": 0.04363495483994484, + "step": 1310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 226.5, + "completions/mean_terminated_length": 226.5, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.13906863265089636, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.02825233223848045, + "learning_rate": 7.475999999999999e-06, + "loss": -0.0526, + "num_tokens": 60732285.0, + "reward": 2.9058597087860107, + "reward_std": 0.06823138147592545, + "rewards/reward_fn/mean": 2.9058597087860107, + "rewards/reward_fn/std": 0.06823134422302246, + "step": 1311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 572.0, + "completions/max_terminated_length": 572.0, + "completions/mean_length": 247.03125, + "completions/mean_terminated_length": 247.03125, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.13917471093667128, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.021199521608650684, + "learning_rate": 7.4756e-06, + "loss": 0.1534, + "num_tokens": 60771326.0, + "reward": 2.8149824142456055, + "reward_std": 0.04035944491624832, + "rewards/reward_fn/mean": 2.8149824142456055, + "rewards/reward_fn/std": 0.04035947099328041, + "step": 1312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 220.71875, + "completions/mean_terminated_length": 220.71875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.13928078922244616, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.020940072368830442, + "learning_rate": 7.4752e-06, + "loss": 0.0157, + "num_tokens": 60827989.0, + "reward": 2.911080837249756, + "reward_std": 0.20522548258304596, + "rewards/reward_fn/mean": 2.911080837249756, + "rewards/reward_fn/std": 0.20522546768188477, + "step": 1313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 196.90625, + "completions/mean_terminated_length": 196.90625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.13938686750822107, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "kl": 0.030910142930224538, + "learning_rate": 7.4748e-06, + "loss": -0.0081, + "num_tokens": 60851026.0, + "reward": 3.170276403427124, + "reward_std": 0.3668881952762604, + "rewards/reward_fn/mean": 3.170276403427124, + "rewards/reward_fn/std": 0.3668881952762604, + "step": 1314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 275.0, + "completions/mean_terminated_length": 275.0, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.13949294579399596, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.025190659100189805, + "learning_rate": 7.4744e-06, + "loss": 0.0444, + "num_tokens": 60897106.0, + "reward": 2.8724751472473145, + "reward_std": 0.056788910180330276, + "rewards/reward_fn/mean": 2.8724751472473145, + "rewards/reward_fn/std": 0.05678891763091087, + "step": 1315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 868.0, + "completions/mean_length": 426.65625, + "completions/mean_terminated_length": 374.3548278808594, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.13959902407977087, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.02236508228816092, + "learning_rate": 7.474e-06, + "loss": 0.1468, + "num_tokens": 60950503.0, + "reward": 2.9462509155273438, + "reward_std": 0.50715571641922, + "rewards/reward_fn/mean": 2.9462509155273438, + "rewards/reward_fn/std": 0.5071556568145752, + "step": 1316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.0, + "completions/max_terminated_length": 646.0, + "completions/mean_length": 139.5625, + "completions/mean_terminated_length": 139.5625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.13970510236554579, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14453125, + "kl": 0.03153642802499235, + "learning_rate": 7.4736e-06, + "loss": 0.0013, + "num_tokens": 60990713.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 201.34375, + "completions/mean_terminated_length": 201.34375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.13981118065132067, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.029545513913035393, + "learning_rate": 7.4732e-06, + "loss": 0.0299, + "num_tokens": 61044836.0, + "reward": 2.9463093280792236, + "reward_std": 0.05201994255185127, + "rewards/reward_fn/mean": 2.9463093280792236, + "rewards/reward_fn/std": 0.052019957453012466, + "step": 1318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.0, + "completions/max_terminated_length": 687.0, + "completions/mean_length": 283.15625, + "completions/mean_terminated_length": 283.15625, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.13991725893709558, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.024044741643592715, + "learning_rate": 7.4728e-06, + "loss": -0.0858, + "num_tokens": 61075273.0, + "reward": 2.9099972248077393, + "reward_std": 0.876579761505127, + "rewards/reward_fn/mean": 2.9099972248077393, + "rewards/reward_fn/std": 0.8765797019004822, + "step": 1319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 581.0, + "completions/max_terminated_length": 581.0, + "completions/mean_length": 148.71875, + "completions/mean_terminated_length": 148.71875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.14002333722287047, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.015515311155468225, + "learning_rate": 7.4724e-06, + "loss": 0.0582, + "num_tokens": 61122720.0, + "reward": 2.7426884174346924, + "reward_std": 0.03458679839968681, + "rewards/reward_fn/mean": 2.7426884174346924, + "rewards/reward_fn/std": 0.034586794674396515, + "step": 1320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 128.75, + "completions/mean_terminated_length": 128.75, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.14012941550864538, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11181640625, + "kl": 0.02550445985980332, + "learning_rate": 7.472e-06, + "loss": 0.001, + "num_tokens": 61180632.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 670.0, + "completions/max_terminated_length": 670.0, + "completions/mean_length": 274.625, + "completions/mean_terminated_length": 274.625, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.14023549379442027, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.031397337559610605, + "learning_rate": 7.471599999999999e-06, + "loss": 0.0942, + "num_tokens": 61231660.0, + "reward": 3.165767192840576, + "reward_std": 0.5331368446350098, + "rewards/reward_fn/mean": 3.165767192840576, + "rewards/reward_fn/std": 0.533136785030365, + "step": 1322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1335.0, + "completions/max_terminated_length": 1335.0, + "completions/mean_length": 357.84375, + "completions/mean_terminated_length": 357.84375, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.14034157208019518, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.02645385661162436, + "learning_rate": 7.471199999999999e-06, + "loss": 0.0052, + "num_tokens": 61274439.0, + "reward": 3.297321319580078, + "reward_std": 1.0589929819107056, + "rewards/reward_fn/mean": 3.297321319580078, + "rewards/reward_fn/std": 1.0589929819107056, + "step": 1323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 326.40625, + "completions/mean_terminated_length": 326.40625, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.1404476503659701, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.026967764366418123, + "learning_rate": 7.4708e-06, + "loss": -0.0404, + "num_tokens": 61319092.0, + "reward": 3.5227808952331543, + "reward_std": 0.747248113155365, + "rewards/reward_fn/mean": 3.5227808952331543, + "rewards/reward_fn/std": 0.747248113155365, + "step": 1324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1725.0, + "completions/max_terminated_length": 1725.0, + "completions/mean_length": 355.0, + "completions/mean_terminated_length": 355.0, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.14055372865174498, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.03049283567816019, + "learning_rate": 7.4704e-06, + "loss": 0.1093, + "num_tokens": 61363092.0, + "reward": 3.722357749938965, + "reward_std": 0.5337069034576416, + "rewards/reward_fn/mean": 3.722357749938965, + "rewards/reward_fn/std": 0.5337069034576416, + "step": 1325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 112.28125, + "completions/mean_terminated_length": 112.28125, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.1406598069375199, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.890625, + "kl": 0.021817383356392384, + "learning_rate": 7.47e-06, + "loss": -0.0305, + "num_tokens": 61409597.0, + "reward": 3.9790706634521484, + "reward_std": 0.11839355528354645, + "rewards/reward_fn/mean": 3.9790706634521484, + "rewards/reward_fn/std": 0.11839357018470764, + "step": 1326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 197.15625, + "completions/mean_terminated_length": 197.15625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.14076588522329478, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "kl": 0.028691569808870554, + "learning_rate": 7.4696e-06, + "loss": 0.0097, + "num_tokens": 61445026.0, + "reward": 3.0395777225494385, + "reward_std": 0.4235535264015198, + "rewards/reward_fn/mean": 3.0395777225494385, + "rewards/reward_fn/std": 0.4235535264015198, + "step": 1327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 803.0, + "completions/max_terminated_length": 803.0, + "completions/mean_length": 205.90625, + "completions/mean_terminated_length": 205.90625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.1408719635090697, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.024880185024812818, + "learning_rate": 7.4691999999999996e-06, + "loss": -0.0302, + "num_tokens": 61468447.0, + "reward": 3.9752869606018066, + "reward_std": 0.1397986114025116, + "rewards/reward_fn/mean": 3.9752869606018066, + "rewards/reward_fn/std": 0.1397986114025116, + "step": 1328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 790.0, + "completions/max_terminated_length": 790.0, + "completions/mean_length": 193.125, + "completions/mean_terminated_length": 193.125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.1409780417948446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1328125, + "kl": 0.03259215364232659, + "learning_rate": 7.4687999999999995e-06, + "loss": 0.0013, + "num_tokens": 61505507.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 284.15625, + "completions/mean_terminated_length": 284.15625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.1410841200806195, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9921875, + "kl": 0.0419064718298614, + "learning_rate": 7.4683999999999995e-06, + "loss": -0.0339, + "num_tokens": 61549448.0, + "reward": 2.5116405487060547, + "reward_std": 0.5236536860466003, + "rewards/reward_fn/mean": 2.5116405487060547, + "rewards/reward_fn/std": 0.5236537456512451, + "step": 1330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 533.0, + "completions/max_terminated_length": 533.0, + "completions/mean_length": 153.84375, + "completions/mean_terminated_length": 153.84375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.1411901983663944, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "kl": 0.025083963526412845, + "learning_rate": 7.4679999999999995e-06, + "loss": -0.0651, + "num_tokens": 61573539.0, + "reward": 3.4854817390441895, + "reward_std": 0.4917638599872589, + "rewards/reward_fn/mean": 3.4854817390441895, + "rewards/reward_fn/std": 0.4917638599872589, + "step": 1331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.0, + "completions/max_terminated_length": 539.0, + "completions/mean_length": 112.65625, + "completions/mean_terminated_length": 112.65625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.1412962766521693, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1845703125, + "kl": 0.02348939247895032, + "learning_rate": 7.4675999999999994e-06, + "loss": 0.0009, + "num_tokens": 61641112.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1583.0, + "completions/max_terminated_length": 1583.0, + "completions/mean_length": 600.40625, + "completions/mean_terminated_length": 600.40625, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.1414023549379442, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.020444039721041918, + "learning_rate": 7.467199999999999e-06, + "loss": -0.0601, + "num_tokens": 61695973.0, + "reward": 3.559945583343506, + "reward_std": 0.6751914620399475, + "rewards/reward_fn/mean": 3.559945583343506, + "rewards/reward_fn/std": 0.6751914620399475, + "step": 1333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1166.0, + "completions/max_terminated_length": 1166.0, + "completions/mean_length": 402.90625, + "completions/mean_terminated_length": 402.90625, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.14150843322371912, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.022967512253671885, + "learning_rate": 7.466799999999999e-06, + "loss": 0.0097, + "num_tokens": 61746978.0, + "reward": 3.1534409523010254, + "reward_std": 0.715872049331665, + "rewards/reward_fn/mean": 3.1534409523010254, + "rewards/reward_fn/std": 0.7158719897270203, + "step": 1334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 230.6875, + "completions/mean_terminated_length": 230.6875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.141614511509494, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9921875, + "kl": 0.031314355321228504, + "learning_rate": 7.4664e-06, + "loss": 0.0287, + "num_tokens": 61796344.0, + "reward": 3.0475525856018066, + "reward_std": 0.18288551270961761, + "rewards/reward_fn/mean": 3.0475525856018066, + "rewards/reward_fn/std": 0.18288545310497284, + "step": 1335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 589.0, + "completions/max_terminated_length": 589.0, + "completions/mean_length": 165.125, + "completions/mean_terminated_length": 165.125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.14172058979526891, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0908203125, + "kl": 0.021358093596063554, + "learning_rate": 7.466e-06, + "loss": 0.0009, + "num_tokens": 61823100.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1696.0, + "completions/max_terminated_length": 1696.0, + "completions/mean_length": 483.21875, + "completions/mean_terminated_length": 483.21875, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.1418266680810438, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.02461231197230518, + "learning_rate": 7.4656e-06, + "loss": -0.0173, + "num_tokens": 61868451.0, + "reward": 3.531749725341797, + "reward_std": 0.7565776705741882, + "rewards/reward_fn/mean": 3.531749725341797, + "rewards/reward_fn/std": 0.7565776705741882, + "step": 1337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 870.0, + "completions/max_terminated_length": 870.0, + "completions/mean_length": 239.625, + "completions/mean_terminated_length": 239.625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.1419327463668187, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.022753535537049174, + "learning_rate": 7.4652e-06, + "loss": 0.0128, + "num_tokens": 61899671.0, + "reward": 3.928053855895996, + "reward_std": 0.4069896936416626, + "rewards/reward_fn/mean": 3.928053855895996, + "rewards/reward_fn/std": 0.4069896936416626, + "step": 1338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1521.0, + "completions/max_terminated_length": 1521.0, + "completions/mean_length": 296.0625, + "completions/mean_terminated_length": 296.0625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.14203882465259363, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.020033373264595866, + "learning_rate": 7.4648e-06, + "loss": -0.1692, + "num_tokens": 61940665.0, + "reward": 3.5547611713409424, + "reward_std": 0.5535134077072144, + "rewards/reward_fn/mean": 3.5547611713409424, + "rewards/reward_fn/std": 0.5535133481025696, + "step": 1339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 599.0, + "completions/max_terminated_length": 599.0, + "completions/mean_length": 273.875, + "completions/mean_terminated_length": 273.875, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.1421449029383685, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.019615008728578687, + "learning_rate": 7.4644e-06, + "loss": -0.0367, + "num_tokens": 61986261.0, + "reward": 3.111820697784424, + "reward_std": 0.4804832935333252, + "rewards/reward_fn/mean": 3.111820697784424, + "rewards/reward_fn/std": 0.4804832339286804, + "step": 1340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.0, + "completions/max_terminated_length": 573.0, + "completions/mean_length": 203.0, + "completions/mean_terminated_length": 203.0, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.14225098122414342, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1103515625, + "kl": 0.020947684068232775, + "learning_rate": 7.464e-06, + "loss": 0.0008, + "num_tokens": 62027157.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 392.96875, + "completions/mean_terminated_length": 392.96875, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.1423570595099183, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.021076830103993416, + "learning_rate": 7.463599999999999e-06, + "loss": 0.0762, + "num_tokens": 62076052.0, + "reward": 2.854060173034668, + "reward_std": 0.03796735033392906, + "rewards/reward_fn/mean": 2.854060173034668, + "rewards/reward_fn/std": 0.03796736150979996, + "step": 1342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1348.0, + "completions/max_terminated_length": 1348.0, + "completions/mean_length": 373.6875, + "completions/mean_terminated_length": 373.6875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.14246313779569322, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.01754242356400937, + "learning_rate": 7.463199999999999e-06, + "loss": 0.1616, + "num_tokens": 62128586.0, + "reward": 2.9887051582336426, + "reward_std": 0.04001903906464577, + "rewards/reward_fn/mean": 2.9887051582336426, + "rewards/reward_fn/std": 0.04001903906464577, + "step": 1343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1015.0, + "completions/max_terminated_length": 1015.0, + "completions/mean_length": 348.9375, + "completions/mean_terminated_length": 348.9375, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.14256921608146814, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.022171859396621585, + "learning_rate": 7.462799999999999e-06, + "loss": -0.0195, + "num_tokens": 62174152.0, + "reward": 3.848252296447754, + "reward_std": 0.40802672505378723, + "rewards/reward_fn/mean": 3.848252296447754, + "rewards/reward_fn/std": 0.40802669525146484, + "step": 1344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 894.0, + "completions/max_terminated_length": 894.0, + "completions/mean_length": 297.03125, + "completions/mean_terminated_length": 297.03125, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.14267529436724302, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.02330731856636703, + "learning_rate": 7.462399999999999e-06, + "loss": 0.1368, + "num_tokens": 62215305.0, + "reward": 3.5138566493988037, + "reward_std": 0.5279030203819275, + "rewards/reward_fn/mean": 3.5138566493988037, + "rewards/reward_fn/std": 0.5279030203819275, + "step": 1345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 197.75, + "completions/mean_terminated_length": 197.75, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.14278137265301794, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.024088377133011818, + "learning_rate": 7.461999999999999e-06, + "loss": 0.1033, + "num_tokens": 62256449.0, + "reward": 2.7401795387268066, + "reward_std": 0.04600340500473976, + "rewards/reward_fn/mean": 2.7401795387268066, + "rewards/reward_fn/std": 0.04600339010357857, + "step": 1346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 634.0, + "completions/max_terminated_length": 634.0, + "completions/mean_length": 241.09375, + "completions/mean_terminated_length": 241.09375, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.14288745093879282, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.025438385782763362, + "learning_rate": 7.4616e-06, + "loss": 0.0087, + "num_tokens": 62301764.0, + "reward": 2.7687675952911377, + "reward_std": 0.29931241273880005, + "rewards/reward_fn/mean": 2.7687675952911377, + "rewards/reward_fn/std": 0.29931241273880005, + "step": 1347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1402.0, + "completions/mean_length": 411.78125, + "completions/mean_terminated_length": 359.0, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.14299352922456773, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.02156771393492818, + "learning_rate": 7.4612e-06, + "loss": 0.297, + "num_tokens": 62350557.0, + "reward": 2.8675591945648193, + "reward_std": 0.6260008215904236, + "rewards/reward_fn/mean": 2.8675591945648193, + "rewards/reward_fn/std": 0.626000702381134, + "step": 1348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1563.0, + "completions/max_terminated_length": 1563.0, + "completions/mean_length": 456.625, + "completions/mean_terminated_length": 456.625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.14309960751034262, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.025939938612282276, + "learning_rate": 7.4608e-06, + "loss": 0.0314, + "num_tokens": 62397009.0, + "reward": 2.5841550827026367, + "reward_std": 0.6542167067527771, + "rewards/reward_fn/mean": 2.5841550827026367, + "rewards/reward_fn/std": 0.6542167067527771, + "step": 1349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 607.0, + "completions/max_terminated_length": 607.0, + "completions/mean_length": 198.34375, + "completions/mean_terminated_length": 198.34375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.14320568579611753, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.030676579335704446, + "learning_rate": 7.4604e-06, + "loss": 0.0779, + "num_tokens": 62434076.0, + "reward": 3.877383232116699, + "reward_std": 0.29128482937812805, + "rewards/reward_fn/mean": 3.877383232116699, + "rewards/reward_fn/std": 0.29128485918045044, + "step": 1350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 121.3125, + "completions/mean_terminated_length": 121.3125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.14331176408189245, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9375, + "kl": 0.02987167122773826, + "learning_rate": 7.46e-06, + "loss": 0.0955, + "num_tokens": 62480742.0, + "reward": 3.9019250869750977, + "reward_std": 0.3102291226387024, + "rewards/reward_fn/mean": 3.9019250869750977, + "rewards/reward_fn/std": 0.3102291226387024, + "step": 1351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 954.0, + "completions/max_terminated_length": 954.0, + "completions/mean_length": 254.5, + "completions/mean_terminated_length": 254.5, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.14341784236766733, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.019354867981746793, + "learning_rate": 7.4596e-06, + "loss": -0.0729, + "num_tokens": 62527126.0, + "reward": 3.7116589546203613, + "reward_std": 0.7752918601036072, + "rewards/reward_fn/mean": 3.7116589546203613, + "rewards/reward_fn/std": 0.7752918004989624, + "step": 1352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 185.0625, + "completions/mean_terminated_length": 185.0625, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.14352392065344224, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.228515625, + "kl": 0.027313646278344095, + "learning_rate": 7.4592e-06, + "loss": 0.0011, + "num_tokens": 62587512.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 241.75, + "completions/mean_terminated_length": 241.75, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.14362999893921713, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.515625, + "kl": 0.03514736890792847, + "learning_rate": 7.4588e-06, + "loss": 0.0202, + "num_tokens": 62632432.0, + "reward": 3.8626348972320557, + "reward_std": 0.5405560731887817, + "rewards/reward_fn/mean": 3.8626348972320557, + "rewards/reward_fn/std": 0.5405560731887817, + "step": 1354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.0, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 239.65625, + "completions/mean_terminated_length": 239.65625, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.14373607722499204, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.02487537218257785, + "learning_rate": 7.4584e-06, + "loss": 0.1213, + "num_tokens": 62670565.0, + "reward": 2.8300230503082275, + "reward_std": 0.05813976749777794, + "rewards/reward_fn/mean": 2.8300230503082275, + "rewards/reward_fn/std": 0.05813978984951973, + "step": 1355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1358.0, + "completions/mean_length": 670.15625, + "completions/mean_terminated_length": 625.7096557617188, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.14384215551076696, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.03125, + "kl": 0.01770936872344464, + "learning_rate": 7.4579999999999996e-06, + "loss": 0.0846, + "num_tokens": 62727050.0, + "reward": 2.2211742401123047, + "reward_std": 0.5450964570045471, + "rewards/reward_fn/mean": 2.2211742401123047, + "rewards/reward_fn/std": 0.5450963973999023, + "step": 1356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 801.0, + "completions/max_terminated_length": 801.0, + "completions/mean_length": 405.46875, + "completions/mean_terminated_length": 405.46875, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.14394823379654184, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.023628632072359324, + "learning_rate": 7.4575999999999995e-06, + "loss": -0.0418, + "num_tokens": 62782905.0, + "reward": 2.8233463764190674, + "reward_std": 0.1016487330198288, + "rewards/reward_fn/mean": 2.8233463764190674, + "rewards/reward_fn/std": 0.10164876282215118, + "step": 1357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1025.0, + "completions/max_terminated_length": 1025.0, + "completions/mean_length": 351.5625, + "completions/mean_terminated_length": 351.5625, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.14405431208231675, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.018899488961324096, + "learning_rate": 7.4571999999999995e-06, + "loss": 0.022, + "num_tokens": 62821035.0, + "reward": 3.8341493606567383, + "reward_std": 0.48483148217201233, + "rewards/reward_fn/mean": 3.8341493606567383, + "rewards/reward_fn/std": 0.48483148217201233, + "step": 1358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1431.0, + "completions/max_terminated_length": 1431.0, + "completions/mean_length": 360.09375, + "completions/mean_terminated_length": 360.09375, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.14416039036809164, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.023247625678777695, + "learning_rate": 7.4568e-06, + "loss": -0.0009, + "num_tokens": 62881390.0, + "reward": 3.9614830017089844, + "reward_std": 0.21788454055786133, + "rewards/reward_fn/mean": 3.9614830017089844, + "rewards/reward_fn/std": 0.2178845852613449, + "step": 1359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1867.0, + "completions/max_terminated_length": 1867.0, + "completions/mean_length": 254.59375, + "completions/mean_terminated_length": 254.59375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.14426646865386655, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "kl": 0.02458102209493518, + "learning_rate": 7.4564e-06, + "loss": -0.1313, + "num_tokens": 62935649.0, + "reward": 2.5300705432891846, + "reward_std": 0.38768091797828674, + "rewards/reward_fn/mean": 2.5300705432891846, + "rewards/reward_fn/std": 0.38768091797828674, + "step": 1360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 596.0, + "completions/max_terminated_length": 596.0, + "completions/mean_length": 181.6875, + "completions/mean_terminated_length": 181.6875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.14437254693964147, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08447265625, + "kl": 0.018385571893304586, + "learning_rate": 7.456e-06, + "loss": 0.0007, + "num_tokens": 62963543.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1670.0, + "completions/max_terminated_length": 1670.0, + "completions/mean_length": 371.875, + "completions/mean_terminated_length": 371.875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.14447862522541635, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.022040294017642736, + "learning_rate": 7.455599999999999e-06, + "loss": 0.0123, + "num_tokens": 63015603.0, + "reward": 2.984866142272949, + "reward_std": 0.4643891155719757, + "rewards/reward_fn/mean": 2.984866142272949, + "rewards/reward_fn/std": 0.4643890857696533, + "step": 1362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 218.3125, + "completions/mean_terminated_length": 218.3125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.14458470351119126, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.02735855709761381, + "learning_rate": 7.455199999999999e-06, + "loss": 0.0325, + "num_tokens": 63057917.0, + "reward": 3.982016086578369, + "reward_std": 0.1017315685749054, + "rewards/reward_fn/mean": 3.982016086578369, + "rewards/reward_fn/std": 0.1017315685749054, + "step": 1363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1252.0, + "completions/max_terminated_length": 1252.0, + "completions/mean_length": 300.25, + "completions/mean_terminated_length": 300.25, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.14469078179696615, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.027118426747620106, + "learning_rate": 7.454799999999999e-06, + "loss": 0.0445, + "num_tokens": 63103237.0, + "reward": 2.8830158710479736, + "reward_std": 0.4302554130554199, + "rewards/reward_fn/mean": 2.8830158710479736, + "rewards/reward_fn/std": 0.4302554130554199, + "step": 1364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1625.0, + "completions/mean_length": 698.09375, + "completions/mean_terminated_length": 608.1000366210938, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.14479686008274106, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.02318622707389295, + "learning_rate": 7.454399999999999e-06, + "loss": 0.2151, + "num_tokens": 63185608.0, + "reward": 2.6326918601989746, + "reward_std": 0.7804985046386719, + "rewards/reward_fn/mean": 2.6326918601989746, + "rewards/reward_fn/std": 0.7804984450340271, + "step": 1365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.0, + "completions/max_terminated_length": 690.0, + "completions/mean_length": 341.96875, + "completions/mean_terminated_length": 341.96875, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.14490293836851598, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.02670998009853065, + "learning_rate": 7.453999999999999e-06, + "loss": -0.0004, + "num_tokens": 63233831.0, + "reward": 2.599997043609619, + "reward_std": 0.41961222887039185, + "rewards/reward_fn/mean": 2.599997043609619, + "rewards/reward_fn/std": 0.41961225867271423, + "step": 1366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1778.0, + "completions/max_terminated_length": 1778.0, + "completions/mean_length": 575.65625, + "completions/mean_terminated_length": 575.65625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.14500901665429086, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.02400823961943388, + "learning_rate": 7.453599999999999e-06, + "loss": -0.0017, + "num_tokens": 63285564.0, + "reward": 2.339834451675415, + "reward_std": 0.6554206013679504, + "rewards/reward_fn/mean": 2.339834451675415, + "rewards/reward_fn/std": 0.6554206013679504, + "step": 1367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 202.46875, + "completions/mean_terminated_length": 202.46875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.14511509494006578, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.025389327201992273, + "learning_rate": 7.453199999999999e-06, + "loss": 0.0224, + "num_tokens": 63332843.0, + "reward": 3.357534408569336, + "reward_std": 0.9463339447975159, + "rewards/reward_fn/mean": 3.357534408569336, + "rewards/reward_fn/std": 0.9463339447975159, + "step": 1368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 259.5625, + "completions/mean_terminated_length": 259.5625, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.14522117322584066, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.029223348945379257, + "learning_rate": 7.452799999999999e-06, + "loss": 0.071, + "num_tokens": 63381533.0, + "reward": 3.966054916381836, + "reward_std": 0.19202205538749695, + "rewards/reward_fn/mean": 3.966054916381836, + "rewards/reward_fn/std": 0.19202204048633575, + "step": 1369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 567.0, + "completions/max_terminated_length": 567.0, + "completions/mean_length": 361.6875, + "completions/mean_terminated_length": 361.6875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.14532725151161557, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.027676680823788047, + "learning_rate": 7.452399999999999e-06, + "loss": 0.0042, + "num_tokens": 63430131.0, + "reward": 3.385025978088379, + "reward_std": 0.6906515955924988, + "rewards/reward_fn/mean": 3.385025978088379, + "rewards/reward_fn/std": 0.690651535987854, + "step": 1370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1436.0, + "completions/max_terminated_length": 1436.0, + "completions/mean_length": 266.1875, + "completions/mean_terminated_length": 266.1875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.1454333297973905, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.031021540984511375, + "learning_rate": 7.452e-06, + "loss": -0.0983, + "num_tokens": 63458841.0, + "reward": 3.9695372581481934, + "reward_std": 0.17232412099838257, + "rewards/reward_fn/mean": 3.9695372581481934, + "rewards/reward_fn/std": 0.17232413589954376, + "step": 1371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 870.0, + "completions/max_terminated_length": 870.0, + "completions/mean_length": 255.28125, + "completions/mean_terminated_length": 255.28125, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.14553940808316537, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.025465503567829728, + "learning_rate": 7.4516e-06, + "loss": -0.0179, + "num_tokens": 63502786.0, + "reward": 2.7831175327301025, + "reward_std": 0.0531310960650444, + "rewards/reward_fn/mean": 2.7831175327301025, + "rewards/reward_fn/std": 0.0531311109662056, + "step": 1372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1434.0, + "completions/max_terminated_length": 1434.0, + "completions/mean_length": 353.78125, + "completions/mean_terminated_length": 353.78125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.14564548636894029, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.020204479689709842, + "learning_rate": 7.4512e-06, + "loss": -0.1409, + "num_tokens": 63544891.0, + "reward": 2.8671717643737793, + "reward_std": 0.0845094546675682, + "rewards/reward_fn/mean": 2.8671717643737793, + "rewards/reward_fn/std": 0.08450954407453537, + "step": 1373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 163.90625, + "completions/mean_terminated_length": 163.90625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.14575156465471517, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.028385634068399668, + "learning_rate": 7.4508e-06, + "loss": 0.0794, + "num_tokens": 63568504.0, + "reward": 3.8483242988586426, + "reward_std": 0.35819515585899353, + "rewards/reward_fn/mean": 3.8483242988586426, + "rewards/reward_fn/std": 0.35819512605667114, + "step": 1374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1143.0, + "completions/max_terminated_length": 1143.0, + "completions/mean_length": 325.21875, + "completions/mean_terminated_length": 325.21875, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.14585764294049008, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.025249871658161283, + "learning_rate": 7.4504e-06, + "loss": 0.046, + "num_tokens": 63615071.0, + "reward": 3.6527295112609863, + "reward_std": 0.4878491461277008, + "rewards/reward_fn/mean": 3.6527295112609863, + "rewards/reward_fn/std": 0.4878491461277008, + "step": 1375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1115.0, + "completions/max_terminated_length": 1115.0, + "completions/mean_length": 345.09375, + "completions/mean_terminated_length": 345.09375, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.14596372122626497, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.028095704270526767, + "learning_rate": 7.45e-06, + "loss": 0.0772, + "num_tokens": 63656162.0, + "reward": 3.285367012023926, + "reward_std": 0.9010963439941406, + "rewards/reward_fn/mean": 3.285367012023926, + "rewards/reward_fn/std": 0.9010962843894958, + "step": 1376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 288.65625, + "completions/mean_terminated_length": 288.65625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.14606979951203988, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.028274354292079806, + "learning_rate": 7.4496e-06, + "loss": -0.0781, + "num_tokens": 63712439.0, + "reward": 3.466069459915161, + "reward_std": 0.9559970498085022, + "rewards/reward_fn/mean": 3.466069459915161, + "rewards/reward_fn/std": 0.9559970498085022, + "step": 1377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 731.0, + "completions/max_terminated_length": 731.0, + "completions/mean_length": 259.84375, + "completions/mean_terminated_length": 259.84375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.1461758777978148, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.027279690839350224, + "learning_rate": 7.4492e-06, + "loss": 0.0394, + "num_tokens": 63773586.0, + "reward": 3.8966031074523926, + "reward_std": 0.43655630946159363, + "rewards/reward_fn/mean": 3.8966031074523926, + "rewards/reward_fn/std": 0.436556339263916, + "step": 1378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 771.0, + "completions/max_terminated_length": 771.0, + "completions/mean_length": 345.25, + "completions/mean_terminated_length": 345.25, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.14628195608358968, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.027204215060919523, + "learning_rate": 7.4488e-06, + "loss": -0.05, + "num_tokens": 63805786.0, + "reward": 3.292912721633911, + "reward_std": 0.6367733478546143, + "rewards/reward_fn/mean": 3.292912721633911, + "rewards/reward_fn/std": 0.6367732882499695, + "step": 1379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 208.3125, + "completions/mean_terminated_length": 208.3125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.1463880343693646, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.029668390285223722, + "learning_rate": 7.4484e-06, + "loss": -0.0387, + "num_tokens": 63848260.0, + "reward": 3.8188915252685547, + "reward_std": 0.5969631671905518, + "rewards/reward_fn/mean": 3.8188915252685547, + "rewards/reward_fn/std": 0.5969631671905518, + "step": 1380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1634.0, + "completions/max_terminated_length": 1634.0, + "completions/mean_length": 286.4375, + "completions/mean_terminated_length": 286.4375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.14649411265513948, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.546875, + "kl": 0.0279810291249305, + "learning_rate": 7.448e-06, + "loss": 0.3408, + "num_tokens": 63904946.0, + "reward": 3.9252352714538574, + "reward_std": 0.42293301224708557, + "rewards/reward_fn/mean": 3.9252352714538574, + "rewards/reward_fn/std": 0.4229329824447632, + "step": 1381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 920.0, + "completions/max_terminated_length": 920.0, + "completions/mean_length": 269.875, + "completions/mean_terminated_length": 269.875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.1466001909409144, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.296875, + "kl": 0.02967662224546075, + "learning_rate": 7.4476000000000005e-06, + "loss": -0.0416, + "num_tokens": 63944174.0, + "reward": 2.6933159828186035, + "reward_std": 0.2893867492675781, + "rewards/reward_fn/mean": 2.6933159828186035, + "rewards/reward_fn/std": 0.2893867492675781, + "step": 1382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1121.0, + "completions/max_terminated_length": 1121.0, + "completions/mean_length": 368.09375, + "completions/mean_terminated_length": 368.09375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.1467062692266893, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.0331441021990031, + "learning_rate": 7.4472e-06, + "loss": 0.06, + "num_tokens": 63986289.0, + "reward": 2.744060516357422, + "reward_std": 0.042238347232341766, + "rewards/reward_fn/mean": 2.744060516357422, + "rewards/reward_fn/std": 0.042238280177116394, + "step": 1383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 612.0, + "completions/max_terminated_length": 612.0, + "completions/mean_length": 203.78125, + "completions/mean_terminated_length": 203.78125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.1468123475124642, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.44140625, + "kl": 0.03312438074499369, + "learning_rate": 7.4468e-06, + "loss": 0.0013, + "num_tokens": 64038122.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 848.0, + "completions/max_terminated_length": 848.0, + "completions/mean_length": 193.25, + "completions/mean_terminated_length": 193.25, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.1469184257982391, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.375, + "kl": 0.03683407441712916, + "learning_rate": 7.4463999999999996e-06, + "loss": 0.1946, + "num_tokens": 64075122.0, + "reward": 2.924485206604004, + "reward_std": 0.07657773792743683, + "rewards/reward_fn/mean": 2.924485206604004, + "rewards/reward_fn/std": 0.07657775282859802, + "step": 1385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1439.0, + "completions/max_terminated_length": 1439.0, + "completions/mean_length": 390.125, + "completions/mean_terminated_length": 390.125, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.147024504084014, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.03401468298397958, + "learning_rate": 7.4459999999999995e-06, + "loss": 0.0587, + "num_tokens": 64122710.0, + "reward": 3.0861051082611084, + "reward_std": 0.702394425868988, + "rewards/reward_fn/mean": 3.0861051082611084, + "rewards/reward_fn/std": 0.702394425868988, + "step": 1386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1095.0, + "completions/max_terminated_length": 1095.0, + "completions/mean_length": 278.09375, + "completions/mean_terminated_length": 278.09375, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.1471305823697889, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.028455683263018727, + "learning_rate": 7.4455999999999995e-06, + "loss": 0.0261, + "num_tokens": 64169593.0, + "reward": 3.964691162109375, + "reward_std": 0.19973696768283844, + "rewards/reward_fn/mean": 3.964691162109375, + "rewards/reward_fn/std": 0.19973698258399963, + "step": 1387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.0, + "completions/max_terminated_length": 568.0, + "completions/mean_length": 174.03125, + "completions/mean_terminated_length": 174.03125, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.14723666065556382, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.03542444505728781, + "learning_rate": 7.4451999999999995e-06, + "loss": -0.0303, + "num_tokens": 64193594.0, + "reward": 3.883316993713379, + "reward_std": 0.314423143863678, + "rewards/reward_fn/mean": 3.883316993713379, + "rewards/reward_fn/std": 0.314423143863678, + "step": 1388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 183.5, + "completions/mean_terminated_length": 183.5, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.1473427389413387, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1083984375, + "kl": 0.022355019696988165, + "learning_rate": 7.4447999999999994e-06, + "loss": 0.0009, + "num_tokens": 64246154.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 636.0, + "completions/max_terminated_length": 636.0, + "completions/mean_length": 207.96875, + "completions/mean_terminated_length": 207.96875, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.14744881722711362, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.890625, + "kl": 0.03179771360009909, + "learning_rate": 7.444399999999999e-06, + "loss": 0.0461, + "num_tokens": 64288105.0, + "reward": 3.929905891418457, + "reward_std": 0.3965129852294922, + "rewards/reward_fn/mean": 3.929905891418457, + "rewards/reward_fn/std": 0.3965129852294922, + "step": 1390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 606.0, + "completions/max_terminated_length": 606.0, + "completions/mean_length": 160.03125, + "completions/mean_terminated_length": 160.03125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.1475548955128885, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.111328125, + "kl": 0.03064539493061602, + "learning_rate": 7.443999999999999e-06, + "loss": 0.0012, + "num_tokens": 64329898.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 940.0, + "completions/max_terminated_length": 940.0, + "completions/mean_length": 347.625, + "completions/mean_terminated_length": 347.625, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.14766097379866341, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.020091792102903128, + "learning_rate": 7.443599999999999e-06, + "loss": 0.0558, + "num_tokens": 64377246.0, + "reward": 2.9814658164978027, + "reward_std": 0.2760011851787567, + "rewards/reward_fn/mean": 2.9814658164978027, + "rewards/reward_fn/std": 0.2760012149810791, + "step": 1392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 835.0, + "completions/max_terminated_length": 835.0, + "completions/mean_length": 332.21875, + "completions/mean_terminated_length": 332.21875, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.14776705208443833, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.026977359084412456, + "learning_rate": 7.443199999999999e-06, + "loss": 0.0893, + "num_tokens": 64425349.0, + "reward": 3.6816189289093018, + "reward_std": 0.5174549221992493, + "rewards/reward_fn/mean": 3.6816189289093018, + "rewards/reward_fn/std": 0.5174549221992493, + "step": 1393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.0, + "completions/max_terminated_length": 545.0, + "completions/mean_length": 163.125, + "completions/mean_terminated_length": 163.125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.1478731303702132, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.328125, + "kl": 0.03630391927435994, + "learning_rate": 7.4428e-06, + "loss": 0.1482, + "num_tokens": 64472041.0, + "reward": 2.634312391281128, + "reward_std": 0.2737848460674286, + "rewards/reward_fn/mean": 2.634312391281128, + "rewards/reward_fn/std": 0.273784875869751, + "step": 1394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 910.0, + "completions/max_terminated_length": 910.0, + "completions/mean_length": 286.0625, + "completions/mean_terminated_length": 286.0625, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.14797920865598813, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.033690345007926226, + "learning_rate": 7.4424e-06, + "loss": 0.1175, + "num_tokens": 64514219.0, + "reward": 2.9914069175720215, + "reward_std": 0.0645083636045456, + "rewards/reward_fn/mean": 2.9914069175720215, + "rewards/reward_fn/std": 0.064508356153965, + "step": 1395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 168.9375, + "completions/mean_terminated_length": 168.9375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.148085286941763, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.02721068379469216, + "learning_rate": 7.442e-06, + "loss": -0.0126, + "num_tokens": 64553929.0, + "reward": 3.966176986694336, + "reward_std": 0.19133220613002777, + "rewards/reward_fn/mean": 3.966176986694336, + "rewards/reward_fn/std": 0.19133223593235016, + "step": 1396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 959.0, + "completions/max_terminated_length": 959.0, + "completions/mean_length": 379.78125, + "completions/mean_terminated_length": 379.78125, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.14819136522753792, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.022854552837088704, + "learning_rate": 7.4416e-06, + "loss": 0.105, + "num_tokens": 64602178.0, + "reward": 3.60275936126709, + "reward_std": 0.5985162854194641, + "rewards/reward_fn/mean": 3.60275936126709, + "rewards/reward_fn/std": 0.5985162854194641, + "step": 1397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.0, + "completions/max_terminated_length": 562.0, + "completions/mean_length": 244.21875, + "completions/mean_terminated_length": 244.21875, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.14829744351331284, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10986328125, + "kl": 0.02474433882161975, + "learning_rate": 7.4412e-06, + "loss": 0.001, + "num_tokens": 64664329.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1701.0, + "completions/max_terminated_length": 1701.0, + "completions/mean_length": 387.8125, + "completions/mean_terminated_length": 387.8125, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.14840352179908772, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.03332933643832803, + "learning_rate": 7.4408e-06, + "loss": 0.0906, + "num_tokens": 64720035.0, + "reward": 3.689403533935547, + "reward_std": 0.5056177377700806, + "rewards/reward_fn/mean": 3.689403533935547, + "rewards/reward_fn/std": 0.5056177377700806, + "step": 1399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 173.84375, + "completions/mean_terminated_length": 173.84375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.14850960008486264, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.0332620891276747, + "learning_rate": 7.4404e-06, + "loss": -0.0072, + "num_tokens": 64769854.0, + "reward": 3.5542819499969482, + "reward_std": 0.5167267322540283, + "rewards/reward_fn/mean": 3.5542819499969482, + "rewards/reward_fn/std": 0.5167266726493835, + "step": 1400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 762.0, + "completions/max_terminated_length": 762.0, + "completions/mean_length": 259.8125, + "completions/mean_terminated_length": 259.8125, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.14861567837063752, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.03940536780282855, + "learning_rate": 7.44e-06, + "loss": 0.0286, + "num_tokens": 64810296.0, + "reward": 3.590843915939331, + "reward_std": 0.5371227860450745, + "rewards/reward_fn/mean": 3.590843915939331, + "rewards/reward_fn/std": 0.5371227860450745, + "step": 1401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1203.0, + "completions/max_terminated_length": 1203.0, + "completions/mean_length": 370.21875, + "completions/mean_terminated_length": 370.21875, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.14872175665641243, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.025106285698711872, + "learning_rate": 7.4396e-06, + "loss": -0.0491, + "num_tokens": 64859839.0, + "reward": 3.833078384399414, + "reward_std": 0.48529359698295593, + "rewards/reward_fn/mean": 3.833078384399414, + "rewards/reward_fn/std": 0.48529356718063354, + "step": 1402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 767.0, + "completions/mean_length": 468.8125, + "completions/mean_terminated_length": 417.8709411621094, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.14882783494218735, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.030068765161558986, + "learning_rate": 7.439199999999999e-06, + "loss": 0.2232, + "num_tokens": 64905561.0, + "reward": 3.2034428119659424, + "reward_std": 0.8694138526916504, + "rewards/reward_fn/mean": 3.2034428119659424, + "rewards/reward_fn/std": 0.8694137930870056, + "step": 1403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.0, + "completions/max_terminated_length": 671.0, + "completions/mean_length": 229.4375, + "completions/mean_terminated_length": 229.4375, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.14893391322796223, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.028407067991793156, + "learning_rate": 7.438799999999999e-06, + "loss": -0.0417, + "num_tokens": 64933799.0, + "reward": 3.6446433067321777, + "reward_std": 0.7922115325927734, + "rewards/reward_fn/mean": 3.6446433067321777, + "rewards/reward_fn/std": 0.7922114729881287, + "step": 1404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 940.0, + "completions/max_terminated_length": 940.0, + "completions/mean_length": 278.40625, + "completions/mean_terminated_length": 278.40625, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.14903999151373715, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.027859984431415796, + "learning_rate": 7.438399999999999e-06, + "loss": -0.0629, + "num_tokens": 64963572.0, + "reward": 3.877469062805176, + "reward_std": 0.3294479548931122, + "rewards/reward_fn/mean": 3.877469062805176, + "rewards/reward_fn/std": 0.3294479250907898, + "step": 1405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1464.0, + "completions/max_terminated_length": 1464.0, + "completions/mean_length": 463.0625, + "completions/mean_terminated_length": 463.0625, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.14914606979951203, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.03835621988400817, + "learning_rate": 7.438e-06, + "loss": -0.1281, + "num_tokens": 65035190.0, + "reward": 2.7208948135375977, + "reward_std": 0.7465806007385254, + "rewards/reward_fn/mean": 2.7208948135375977, + "rewards/reward_fn/std": 0.7465806603431702, + "step": 1406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1036.0, + "completions/max_terminated_length": 1036.0, + "completions/mean_length": 334.3125, + "completions/mean_terminated_length": 334.3125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.14925214808528695, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732421875, + "kl": 0.022742543602362275, + "learning_rate": 7.4376e-06, + "loss": 0.0009, + "num_tokens": 65065216.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 195.125, + "completions/mean_terminated_length": 195.125, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.14935822637106183, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.119140625, + "kl": 0.02962300064973533, + "learning_rate": 7.4372e-06, + "loss": 0.0012, + "num_tokens": 65106340.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 135.65625, + "completions/mean_terminated_length": 135.65625, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.14946430465683674, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1357421875, + "kl": 0.031661511631682515, + "learning_rate": 7.4368e-06, + "loss": 0.0013, + "num_tokens": 65160281.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 934.0, + "completions/max_terminated_length": 934.0, + "completions/mean_length": 239.4375, + "completions/mean_terminated_length": 239.4375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.14957038294261166, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.033848599530756474, + "learning_rate": 7.4364e-06, + "loss": 0.1764, + "num_tokens": 65201351.0, + "reward": 3.1107335090637207, + "reward_std": 0.08597031980752945, + "rewards/reward_fn/mean": 3.1107335090637207, + "rewards/reward_fn/std": 0.08597029000520706, + "step": 1410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 873.0, + "completions/max_terminated_length": 873.0, + "completions/mean_length": 237.09375, + "completions/mean_terminated_length": 237.09375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.14967646122838654, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.03387490310706198, + "learning_rate": 7.436e-06, + "loss": -0.0979, + "num_tokens": 65241162.0, + "reward": 2.9534237384796143, + "reward_std": 0.7266772985458374, + "rewards/reward_fn/mean": 2.9534237384796143, + "rewards/reward_fn/std": 0.7266772389411926, + "step": 1411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 288.375, + "completions/mean_terminated_length": 288.375, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.14978253951416146, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.031444058986380696, + "learning_rate": 7.4356e-06, + "loss": 0.0013, + "num_tokens": 65267030.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 911.0, + "completions/max_terminated_length": 911.0, + "completions/mean_length": 256.375, + "completions/mean_terminated_length": 256.375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.14988861779993634, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10986328125, + "kl": 0.029305062256753445, + "learning_rate": 7.4351999999999996e-06, + "loss": 0.0012, + "num_tokens": 65316098.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 134.0, + "completions/max_terminated_length": 134.0, + "completions/mean_length": 83.40625, + "completions/mean_terminated_length": 83.40625, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.14999469608571125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1455078125, + "kl": 0.02644193370360881, + "learning_rate": 7.4347999999999995e-06, + "loss": 0.0011, + "num_tokens": 65339471.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1957.0, + "completions/max_terminated_length": 1957.0, + "completions/mean_length": 366.46875, + "completions/mean_terminated_length": 366.46875, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.15010077437148617, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.04253972531296313, + "learning_rate": 7.4343999999999995e-06, + "loss": -0.2152, + "num_tokens": 65380158.0, + "reward": 2.7263221740722656, + "reward_std": 0.6098658442497253, + "rewards/reward_fn/mean": 2.7263221740722656, + "rewards/reward_fn/std": 0.6098658442497253, + "step": 1415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1767.0, + "completions/max_terminated_length": 1767.0, + "completions/mean_length": 599.03125, + "completions/mean_terminated_length": 599.03125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.15020685265726105, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.029143204214051366, + "learning_rate": 7.4339999999999995e-06, + "loss": 0.0824, + "num_tokens": 65433343.0, + "reward": 3.3289918899536133, + "reward_std": 0.602114200592041, + "rewards/reward_fn/mean": 3.3289918899536133, + "rewards/reward_fn/std": 0.602114200592041, + "step": 1416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 218.375, + "completions/mean_terminated_length": 218.375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.15031293094303597, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.030435963766649365, + "learning_rate": 7.4336e-06, + "loss": -0.0891, + "num_tokens": 65485099.0, + "reward": 3.7489049434661865, + "reward_std": 0.6941893696784973, + "rewards/reward_fn/mean": 3.7489049434661865, + "rewards/reward_fn/std": 0.6941893696784973, + "step": 1417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 183.28125, + "completions/mean_terminated_length": 183.28125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.15041900922881085, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1328125, + "kl": 0.03691709297709167, + "learning_rate": 7.4332e-06, + "loss": 0.0015, + "num_tokens": 65524468.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 109.375, + "completions/mean_terminated_length": 109.375, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.15052508751458576, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.205078125, + "kl": 0.039213865995407104, + "learning_rate": 7.4328e-06, + "loss": 0.0016, + "num_tokens": 65552992.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1900.0, + "completions/mean_length": 446.96875, + "completions/mean_terminated_length": 395.32257080078125, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.15063116580036068, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.022784803761169314, + "learning_rate": 7.4324e-06, + "loss": 0.3249, + "num_tokens": 65605791.0, + "reward": 2.73966646194458, + "reward_std": 0.5022208094596863, + "rewards/reward_fn/mean": 2.73966646194458, + "rewards/reward_fn/std": 0.5022208094596863, + "step": 1420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 178.9375, + "completions/mean_terminated_length": 178.9375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.15073724408613556, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.53125, + "kl": 0.032336236676201224, + "learning_rate": 7.432e-06, + "loss": 0.1029, + "num_tokens": 65642141.0, + "reward": 3.7554450035095215, + "reward_std": 0.3876785337924957, + "rewards/reward_fn/mean": 3.7554450035095215, + "rewards/reward_fn/std": 0.38767850399017334, + "step": 1421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 652.0, + "completions/max_terminated_length": 652.0, + "completions/mean_length": 208.1875, + "completions/mean_terminated_length": 208.1875, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.15084332237191048, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.035623660776764154, + "learning_rate": 7.4316e-06, + "loss": 0.1096, + "num_tokens": 65681603.0, + "reward": 2.7092933654785156, + "reward_std": 0.28102341294288635, + "rewards/reward_fn/mean": 2.7092933654785156, + "rewards/reward_fn/std": 0.28102338314056396, + "step": 1422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 741.0, + "completions/max_terminated_length": 741.0, + "completions/mean_length": 158.34375, + "completions/mean_terminated_length": 158.34375, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.15094940065768536, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "kl": 0.03153680078685284, + "learning_rate": 7.431199999999999e-06, + "loss": -0.0881, + "num_tokens": 65710254.0, + "reward": 3.8465654850006104, + "reward_std": 0.3623442053794861, + "rewards/reward_fn/mean": 3.8465654850006104, + "rewards/reward_fn/std": 0.36234423518180847, + "step": 1423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1457.0, + "completions/mean_length": 714.375, + "completions/mean_terminated_length": 576.413818359375, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.15105547894346028, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9453125, + "kl": 0.028179930755868554, + "learning_rate": 7.430799999999999e-06, + "loss": 0.0505, + "num_tokens": 65775770.0, + "reward": 2.2804367542266846, + "reward_std": 0.8255341053009033, + "rewards/reward_fn/mean": 2.2804367542266846, + "rewards/reward_fn/std": 0.8255340456962585, + "step": 1424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 91.0625, + "completions/mean_terminated_length": 91.0625, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.1511615572292352, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1650390625, + "kl": 0.029801467899233103, + "learning_rate": 7.430399999999999e-06, + "loss": 0.0012, + "num_tokens": 65809468.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1030.0, + "completions/max_terminated_length": 1030.0, + "completions/mean_length": 283.5, + "completions/mean_terminated_length": 283.5, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.15126763551501007, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.02984489407390356, + "learning_rate": 7.429999999999999e-06, + "loss": -0.0261, + "num_tokens": 65834380.0, + "reward": 3.668632984161377, + "reward_std": 0.538867175579071, + "rewards/reward_fn/mean": 3.668632984161377, + "rewards/reward_fn/std": 0.5388672351837158, + "step": 1426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 656.0, + "completions/max_terminated_length": 656.0, + "completions/mean_length": 238.28125, + "completions/mean_terminated_length": 238.28125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.151373713800785, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.21875, + "kl": 0.023655685363337398, + "learning_rate": 7.429599999999999e-06, + "loss": 0.1598, + "num_tokens": 65875157.0, + "reward": 3.9262659549713135, + "reward_std": 0.41710224747657776, + "rewards/reward_fn/mean": 3.9262659549713135, + "rewards/reward_fn/std": 0.41710227727890015, + "step": 1427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1091.0, + "completions/max_terminated_length": 1091.0, + "completions/mean_length": 348.1875, + "completions/mean_terminated_length": 348.1875, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.15147979208655987, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.02328518428839743, + "learning_rate": 7.429199999999999e-06, + "loss": -0.0487, + "num_tokens": 65924859.0, + "reward": 3.6566414833068848, + "reward_std": 0.5582861304283142, + "rewards/reward_fn/mean": 3.6566414833068848, + "rewards/reward_fn/std": 0.5582861304283142, + "step": 1428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 963.0, + "completions/max_terminated_length": 963.0, + "completions/mean_length": 348.0625, + "completions/mean_terminated_length": 348.0625, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.15158587037233479, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.022434451850131154, + "learning_rate": 7.4288e-06, + "loss": 0.0641, + "num_tokens": 65965245.0, + "reward": 2.7481746673583984, + "reward_std": 0.4162106215953827, + "rewards/reward_fn/mean": 2.7481746673583984, + "rewards/reward_fn/std": 0.4162106215953827, + "step": 1429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 133.71875, + "completions/mean_terminated_length": 133.71875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.1516919486581097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18359375, + "kl": 0.04026283789426088, + "learning_rate": 7.4284e-06, + "loss": 0.0016, + "num_tokens": 66012404.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1083.0, + "completions/max_terminated_length": 1083.0, + "completions/mean_length": 337.6875, + "completions/mean_terminated_length": 337.6875, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.15179802694388458, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.026995733845978975, + "learning_rate": 7.428e-06, + "loss": 0.0028, + "num_tokens": 66056682.0, + "reward": 3.900752544403076, + "reward_std": 0.3138922154903412, + "rewards/reward_fn/mean": 3.900752544403076, + "rewards/reward_fn/std": 0.3138922154903412, + "step": 1431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1176.0, + "completions/max_terminated_length": 1176.0, + "completions/mean_length": 227.59375, + "completions/mean_terminated_length": 227.59375, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.1519041052296595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10302734375, + "kl": 0.02565616718493402, + "learning_rate": 7.4276e-06, + "loss": 0.001, + "num_tokens": 66096317.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1328.0, + "completions/max_terminated_length": 1328.0, + "completions/mean_length": 389.125, + "completions/mean_terminated_length": 389.125, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.15201018351543438, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.023474588757380843, + "learning_rate": 7.4272e-06, + "loss": 0.1392, + "num_tokens": 66153665.0, + "reward": 3.253533363342285, + "reward_std": 0.5899655818939209, + "rewards/reward_fn/mean": 3.253533363342285, + "rewards/reward_fn/std": 0.5899655818939209, + "step": 1433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 161.28125, + "completions/mean_terminated_length": 161.28125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.1521162618012093, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.640625, + "kl": 0.03629247797653079, + "learning_rate": 7.4268e-06, + "loss": 0.0981, + "num_tokens": 66189610.0, + "reward": 3.0274600982666016, + "reward_std": 0.05274336412549019, + "rewards/reward_fn/mean": 3.0274600982666016, + "rewards/reward_fn/std": 0.05274338647723198, + "step": 1434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1095.0, + "completions/max_terminated_length": 1095.0, + "completions/mean_length": 248.15625, + "completions/mean_terminated_length": 248.15625, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.15222234008698418, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.029007033677771688, + "learning_rate": 7.4264e-06, + "loss": 0.0845, + "num_tokens": 66229775.0, + "reward": 3.6650195121765137, + "reward_std": 0.5066304802894592, + "rewards/reward_fn/mean": 3.6650195121765137, + "rewards/reward_fn/std": 0.5066304802894592, + "step": 1435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 773.0, + "completions/max_terminated_length": 773.0, + "completions/mean_length": 224.375, + "completions/mean_terminated_length": 224.375, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.1523284183727591, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.023438591044396162, + "learning_rate": 7.426e-06, + "loss": 0.1349, + "num_tokens": 66266011.0, + "reward": 3.8179659843444824, + "reward_std": 0.3869030177593231, + "rewards/reward_fn/mean": 3.8179659843444824, + "rewards/reward_fn/std": 0.3869030177593231, + "step": 1436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1129.0, + "completions/max_terminated_length": 1129.0, + "completions/mean_length": 294.5, + "completions/mean_terminated_length": 294.5, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.152434496658534, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.023724337574094534, + "learning_rate": 7.4256e-06, + "loss": -0.069, + "num_tokens": 66319435.0, + "reward": 3.4863743782043457, + "reward_std": 0.593258798122406, + "rewards/reward_fn/mean": 3.4863743782043457, + "rewards/reward_fn/std": 0.5932587385177612, + "step": 1437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1090.0, + "completions/max_terminated_length": 1090.0, + "completions/mean_length": 292.6875, + "completions/mean_terminated_length": 292.6875, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.1525405749443089, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.033242793986573815, + "learning_rate": 7.4252e-06, + "loss": 0.0595, + "num_tokens": 66366145.0, + "reward": 2.89218807220459, + "reward_std": 0.46033063530921936, + "rewards/reward_fn/mean": 2.89218807220459, + "rewards/reward_fn/std": 0.46033063530921936, + "step": 1438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1363.0, + "completions/max_terminated_length": 1363.0, + "completions/mean_length": 487.8125, + "completions/mean_terminated_length": 487.8125, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.1526466532300838, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.375, + "kl": 0.02517133066430688, + "learning_rate": 7.4248e-06, + "loss": 0.1089, + "num_tokens": 66419227.0, + "reward": 2.5610084533691406, + "reward_std": 0.344752699136734, + "rewards/reward_fn/mean": 2.5610084533691406, + "rewards/reward_fn/std": 0.344752699136734, + "step": 1439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1321.0, + "completions/max_terminated_length": 1321.0, + "completions/mean_length": 266.875, + "completions/mean_terminated_length": 266.875, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.1527527315158587, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.103515625, + "kl": 0.03170533524826169, + "learning_rate": 7.4244e-06, + "loss": 0.0013, + "num_tokens": 66465399.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1089.0, + "completions/max_terminated_length": 1089.0, + "completions/mean_length": 403.78125, + "completions/mean_terminated_length": 403.78125, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.1528588098016336, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.03218272537924349, + "learning_rate": 7.424e-06, + "loss": 0.0298, + "num_tokens": 66512848.0, + "reward": 2.8888301849365234, + "reward_std": 0.0484078973531723, + "rewards/reward_fn/mean": 2.8888301849365234, + "rewards/reward_fn/std": 0.04840795695781708, + "step": 1441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1320.0, + "completions/mean_length": 496.625, + "completions/mean_terminated_length": 393.20001220703125, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.15296488808740852, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.028265361906960607, + "learning_rate": 7.4236e-06, + "loss": 0.3739, + "num_tokens": 66575492.0, + "reward": 3.108246088027954, + "reward_std": 1.1263806819915771, + "rewards/reward_fn/mean": 3.108246088027954, + "rewards/reward_fn/std": 1.1263806819915771, + "step": 1442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 208.375, + "completions/mean_terminated_length": 208.375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.1530709663731834, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.028314963448792696, + "learning_rate": 7.4231999999999995e-06, + "loss": -0.1091, + "num_tokens": 66614960.0, + "reward": 3.644209146499634, + "reward_std": 0.5789510011672974, + "rewards/reward_fn/mean": 3.644209146499634, + "rewards/reward_fn/std": 0.5789510011672974, + "step": 1443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 641.0, + "completions/max_terminated_length": 641.0, + "completions/mean_length": 217.5625, + "completions/mean_terminated_length": 217.5625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.15317704465895832, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "kl": 0.028587039094418287, + "learning_rate": 7.4227999999999995e-06, + "loss": -0.0666, + "num_tokens": 66650338.0, + "reward": 3.039794445037842, + "reward_std": 0.03578682616353035, + "rewards/reward_fn/mean": 3.039794445037842, + "rewards/reward_fn/std": 0.035786814987659454, + "step": 1444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.0, + "completions/max_terminated_length": 618.0, + "completions/mean_length": 164.09375, + "completions/mean_terminated_length": 164.09375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.1532831229447332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1484375, + "kl": 0.03915078402496874, + "learning_rate": 7.4223999999999994e-06, + "loss": 0.0016, + "num_tokens": 66706437.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 868.0, + "completions/max_terminated_length": 868.0, + "completions/mean_length": 211.5, + "completions/mean_terminated_length": 211.5, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.15338920123050812, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1376953125, + "kl": 0.03331913007423282, + "learning_rate": 7.421999999999999e-06, + "loss": 0.0013, + "num_tokens": 66736949.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 775.0, + "completions/max_terminated_length": 775.0, + "completions/mean_length": 387.59375, + "completions/mean_terminated_length": 387.59375, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.15349527951628303, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.02698996476829052, + "learning_rate": 7.421599999999999e-06, + "loss": 0.1122, + "num_tokens": 66783496.0, + "reward": 3.010918617248535, + "reward_std": 0.18900462985038757, + "rewards/reward_fn/mean": 3.010918617248535, + "rewards/reward_fn/std": 0.18900460004806519, + "step": 1447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 904.0, + "completions/max_terminated_length": 904.0, + "completions/mean_length": 294.625, + "completions/mean_terminated_length": 294.625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.15360135780205791, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.130859375, + "kl": 0.03707153582945466, + "learning_rate": 7.421199999999999e-06, + "loss": 0.0015, + "num_tokens": 66830364.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 940.0, + "completions/max_terminated_length": 940.0, + "completions/mean_length": 505.3125, + "completions/mean_terminated_length": 505.3125, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.15370743608783283, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.027398626087233424, + "learning_rate": 7.420799999999999e-06, + "loss": -0.1571, + "num_tokens": 66878758.0, + "reward": 2.23410701751709, + "reward_std": 0.6652016043663025, + "rewards/reward_fn/mean": 2.23410701751709, + "rewards/reward_fn/std": 0.6652015447616577, + "step": 1449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 699.0, + "completions/max_terminated_length": 699.0, + "completions/mean_length": 177.53125, + "completions/mean_terminated_length": 177.53125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.1538135143736077, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09375, + "kl": 0.024694280233234167, + "learning_rate": 7.420399999999999e-06, + "loss": 0.001, + "num_tokens": 66913015.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 140.0, + "completions/max_terminated_length": 140.0, + "completions/mean_length": 98.28125, + "completions/mean_terminated_length": 98.28125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.15391959265938263, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1591796875, + "kl": 0.03571772645227611, + "learning_rate": 7.419999999999999e-06, + "loss": 0.0014, + "num_tokens": 66934880.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 349.125, + "completions/mean_terminated_length": 349.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.15402567094515754, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07177734375, + "kl": 0.022626615595072508, + "learning_rate": 7.419599999999999e-06, + "loss": 0.0009, + "num_tokens": 66986308.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 151.875, + "completions/mean_terminated_length": 151.875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.15413174923093242, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14453125, + "kl": 0.03107062610797584, + "learning_rate": 7.4192e-06, + "loss": 0.0012, + "num_tokens": 67026176.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 296.625, + "completions/mean_terminated_length": 296.625, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.15423782751670734, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.03296915185637772, + "learning_rate": 7.4188e-06, + "loss": -0.006, + "num_tokens": 67097236.0, + "reward": 2.750486373901367, + "reward_std": 0.2804044187068939, + "rewards/reward_fn/mean": 2.750486373901367, + "rewards/reward_fn/std": 0.2804044485092163, + "step": 1454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 995.0, + "completions/max_terminated_length": 995.0, + "completions/mean_length": 259.5625, + "completions/mean_terminated_length": 259.5625, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.15434390580248222, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.453125, + "kl": 0.030945046106353402, + "learning_rate": 7.4184e-06, + "loss": -0.0079, + "num_tokens": 67147206.0, + "reward": 2.977381944656372, + "reward_std": 0.19623248279094696, + "rewards/reward_fn/mean": 2.977381944656372, + "rewards/reward_fn/std": 0.19623248279094696, + "step": 1455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 77.8125, + "completions/mean_terminated_length": 77.8125, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.15444998408825714, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12890625, + "kl": 0.021643459796905518, + "learning_rate": 7.418e-06, + "loss": 0.0009, + "num_tokens": 67174976.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1047.0, + "completions/max_terminated_length": 1047.0, + "completions/mean_length": 401.6875, + "completions/mean_terminated_length": 401.6875, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.15455606237403205, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.021151326596736908, + "learning_rate": 7.4176e-06, + "loss": 0.0389, + "num_tokens": 67226486.0, + "reward": 3.7367396354675293, + "reward_std": 0.5055917501449585, + "rewards/reward_fn/mean": 3.7367396354675293, + "rewards/reward_fn/std": 0.5055916905403137, + "step": 1457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1139.0, + "completions/mean_length": 540.5625, + "completions/mean_terminated_length": 491.9354553222656, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.15466214065980693, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.028610155452042818, + "learning_rate": 7.4172e-06, + "loss": 0.1628, + "num_tokens": 67281736.0, + "reward": 2.7234995365142822, + "reward_std": 0.5659182667732239, + "rewards/reward_fn/mean": 2.7234995365142822, + "rewards/reward_fn/std": 0.5659182667732239, + "step": 1458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 715.0, + "completions/max_terminated_length": 715.0, + "completions/mean_length": 180.71875, + "completions/mean_terminated_length": 180.71875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.15476821894558185, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "kl": 0.0297960857860744, + "learning_rate": 7.4168e-06, + "loss": 0.0607, + "num_tokens": 67324383.0, + "reward": 3.912855625152588, + "reward_std": 0.2752879559993744, + "rewards/reward_fn/mean": 3.912855625152588, + "rewards/reward_fn/std": 0.275287926197052, + "step": 1459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 118.0, + "completions/max_terminated_length": 118.0, + "completions/mean_length": 84.0, + "completions/mean_terminated_length": 84.0, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.15487429723135673, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1708984375, + "kl": 0.028807405149564147, + "learning_rate": 7.4164e-06, + "loss": 0.0012, + "num_tokens": 67354111.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 925.0, + "completions/max_terminated_length": 925.0, + "completions/mean_length": 197.25, + "completions/mean_terminated_length": 197.25, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.15498037551713165, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10693359375, + "kl": 0.028027324238792062, + "learning_rate": 7.416e-06, + "loss": 0.0011, + "num_tokens": 67389799.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 243.375, + "completions/mean_terminated_length": 243.375, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.15508645380290653, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.021245840471237898, + "learning_rate": 7.4156e-06, + "loss": -0.0155, + "num_tokens": 67441011.0, + "reward": 3.966932773590088, + "reward_std": 0.18705597519874573, + "rewards/reward_fn/mean": 3.966932773590088, + "rewards/reward_fn/std": 0.1870560199022293, + "step": 1462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.0, + "completions/max_terminated_length": 592.0, + "completions/mean_length": 364.6875, + "completions/mean_terminated_length": 364.6875, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.15519253208868145, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.026140006259083748, + "learning_rate": 7.415199999999999e-06, + "loss": -0.0479, + "num_tokens": 67468105.0, + "reward": 2.71226167678833, + "reward_std": 0.1927633434534073, + "rewards/reward_fn/mean": 2.71226167678833, + "rewards/reward_fn/std": 0.1927633434534073, + "step": 1463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.0, + "completions/max_terminated_length": 707.0, + "completions/mean_length": 264.875, + "completions/mean_terminated_length": 264.875, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.15529861037445636, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.03174735209904611, + "learning_rate": 7.414799999999999e-06, + "loss": -0.0027, + "num_tokens": 67493285.0, + "reward": 2.8976378440856934, + "reward_std": 0.04806054010987282, + "rewards/reward_fn/mean": 2.8976378440856934, + "rewards/reward_fn/std": 0.04806055501103401, + "step": 1464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 191.15625, + "completions/mean_terminated_length": 191.15625, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.15540468866023124, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.027835983550176024, + "learning_rate": 7.4144e-06, + "loss": 0.0278, + "num_tokens": 67536010.0, + "reward": 3.7838172912597656, + "reward_std": 0.38195741176605225, + "rewards/reward_fn/mean": 3.7838172912597656, + "rewards/reward_fn/std": 0.38195741176605225, + "step": 1465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1234.0, + "completions/max_terminated_length": 1234.0, + "completions/mean_length": 191.9375, + "completions/mean_terminated_length": 191.9375, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.15551076694600616, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.265625, + "kl": 0.03196986531838775, + "learning_rate": 7.414e-06, + "loss": 0.1454, + "num_tokens": 67579784.0, + "reward": 3.9290502071380615, + "reward_std": 0.4013527035713196, + "rewards/reward_fn/mean": 3.9290502071380615, + "rewards/reward_fn/std": 0.4013526737689972, + "step": 1466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 970.0, + "completions/max_terminated_length": 970.0, + "completions/mean_length": 279.4375, + "completions/mean_terminated_length": 279.4375, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.15561684523178104, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.02996737160719931, + "learning_rate": 7.4136e-06, + "loss": -0.0289, + "num_tokens": 67628854.0, + "reward": 3.9293787479400635, + "reward_std": 0.3994941711425781, + "rewards/reward_fn/mean": 3.9293787479400635, + "rewards/reward_fn/std": 0.3994941711425781, + "step": 1467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 798.0, + "completions/max_terminated_length": 798.0, + "completions/mean_length": 226.03125, + "completions/mean_terminated_length": 226.03125, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.15572292351755596, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0869140625, + "kl": 0.02296249126084149, + "learning_rate": 7.4132e-06, + "loss": 0.0009, + "num_tokens": 67658199.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 936.0, + "completions/max_terminated_length": 936.0, + "completions/mean_length": 243.71875, + "completions/mean_terminated_length": 243.71875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.15582900180333087, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "kl": 0.026919973781332374, + "learning_rate": 7.4127999999999996e-06, + "loss": 0.0093, + "num_tokens": 67711342.0, + "reward": 2.8784685134887695, + "reward_std": 0.3173023462295532, + "rewards/reward_fn/mean": 2.8784685134887695, + "rewards/reward_fn/std": 0.31730228662490845, + "step": 1469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 220.40625, + "completions/mean_terminated_length": 220.40625, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.15593508008910575, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.025421129539608955, + "learning_rate": 7.4123999999999995e-06, + "loss": 0.0564, + "num_tokens": 67759163.0, + "reward": 2.9090018272399902, + "reward_std": 0.3549058735370636, + "rewards/reward_fn/mean": 2.9090018272399902, + "rewards/reward_fn/std": 0.3549058437347412, + "step": 1470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 845.0, + "completions/max_terminated_length": 845.0, + "completions/mean_length": 239.5625, + "completions/mean_terminated_length": 239.5625, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.15604115837488067, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1416015625, + "kl": 0.03431223169900477, + "learning_rate": 7.4119999999999995e-06, + "loss": 0.0014, + "num_tokens": 67809389.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 169.40625, + "completions/mean_terminated_length": 169.40625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.15614723666065555, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.017876636935397983, + "learning_rate": 7.4115999999999995e-06, + "loss": 0.0007, + "num_tokens": 67834682.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 197.8125, + "completions/mean_terminated_length": 197.8125, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.15625331494643047, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.026579777244478464, + "learning_rate": 7.4111999999999994e-06, + "loss": -0.0256, + "num_tokens": 67882676.0, + "reward": 3.851898670196533, + "reward_std": 0.4995054602622986, + "rewards/reward_fn/mean": 3.851898670196533, + "rewards/reward_fn/std": 0.49950549006462097, + "step": 1473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 900.0, + "completions/max_terminated_length": 900.0, + "completions/mean_length": 230.40625, + "completions/mean_terminated_length": 230.40625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.15635939323220538, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09716796875, + "kl": 0.02615373209118843, + "learning_rate": 7.410799999999999e-06, + "loss": 0.001, + "num_tokens": 67922849.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 584.0, + "completions/max_terminated_length": 584.0, + "completions/mean_length": 175.96875, + "completions/mean_terminated_length": 175.96875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.15646547151798026, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.375, + "kl": 0.024530835915356874, + "learning_rate": 7.410399999999999e-06, + "loss": -0.0158, + "num_tokens": 67964992.0, + "reward": 3.9716320037841797, + "reward_std": 0.1604730784893036, + "rewards/reward_fn/mean": 3.9716320037841797, + "rewards/reward_fn/std": 0.1604730784893036, + "step": 1475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 768.0, + "completions/max_terminated_length": 768.0, + "completions/mean_length": 182.84375, + "completions/mean_terminated_length": 182.84375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.15657154980375518, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.021006754599511623, + "learning_rate": 7.41e-06, + "loss": 0.0792, + "num_tokens": 68009211.0, + "reward": 3.662135362625122, + "reward_std": 0.509412944316864, + "rewards/reward_fn/mean": 3.662135362625122, + "rewards/reward_fn/std": 0.509412944316864, + "step": 1476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 678.0, + "completions/max_terminated_length": 678.0, + "completions/mean_length": 222.3125, + "completions/mean_terminated_length": 222.3125, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.15667762808953006, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09814453125, + "kl": 0.023102863458916545, + "learning_rate": 7.4096e-06, + "loss": 0.0009, + "num_tokens": 68057541.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1068.0, + "completions/max_terminated_length": 1068.0, + "completions/mean_length": 308.875, + "completions/mean_terminated_length": 308.875, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.15678370637530498, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.027743780752643943, + "learning_rate": 7.4092e-06, + "loss": 0.0634, + "num_tokens": 68103777.0, + "reward": 3.5075292587280273, + "reward_std": 0.9456666707992554, + "rewards/reward_fn/mean": 3.5075292587280273, + "rewards/reward_fn/std": 0.9456667304039001, + "step": 1478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1447.0, + "completions/mean_length": 574.75, + "completions/mean_terminated_length": 527.2257690429688, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.1568897846610799, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.02394183212891221, + "learning_rate": 7.4088e-06, + "loss": 0.1843, + "num_tokens": 68157337.0, + "reward": 3.4912631511688232, + "reward_std": 0.9075994491577148, + "rewards/reward_fn/mean": 3.4912631511688232, + "rewards/reward_fn/std": 0.9075994491577148, + "step": 1479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1163.0, + "completions/max_terminated_length": 1163.0, + "completions/mean_length": 253.9375, + "completions/mean_terminated_length": 253.9375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.15699586294685478, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "kl": 0.036285872804000974, + "learning_rate": 7.4084e-06, + "loss": -0.0445, + "num_tokens": 68200823.0, + "reward": 3.738661766052246, + "reward_std": 0.4272725582122803, + "rewards/reward_fn/mean": 3.738661766052246, + "rewards/reward_fn/std": 0.4272725582122803, + "step": 1480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 942.0, + "completions/max_terminated_length": 942.0, + "completions/mean_length": 310.25, + "completions/mean_terminated_length": 310.25, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.1571019412326297, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.021707464940845966, + "learning_rate": 7.408e-06, + "loss": 0.1426, + "num_tokens": 68248127.0, + "reward": 2.7692253589630127, + "reward_std": 0.04598098248243332, + "rewards/reward_fn/mean": 2.7692253589630127, + "rewards/reward_fn/std": 0.04598100483417511, + "step": 1481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 769.0, + "completions/max_terminated_length": 769.0, + "completions/mean_length": 259.0625, + "completions/mean_terminated_length": 259.0625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.15720801951840457, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0830078125, + "kl": 0.025276067899540067, + "learning_rate": 7.4076e-06, + "loss": 0.001, + "num_tokens": 68301793.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1172.0, + "completions/max_terminated_length": 1172.0, + "completions/mean_length": 295.5625, + "completions/mean_terminated_length": 295.5625, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.1573140978041795, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.0196508695371449, + "learning_rate": 7.407199999999999e-06, + "loss": 0.0384, + "num_tokens": 68347571.0, + "reward": 3.862781047821045, + "reward_std": 0.43722283840179443, + "rewards/reward_fn/mean": 3.862781047821045, + "rewards/reward_fn/std": 0.43722283840179443, + "step": 1483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 864.0, + "completions/max_terminated_length": 864.0, + "completions/mean_length": 269.6875, + "completions/mean_terminated_length": 269.6875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.1574201760899544, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.031088900519534945, + "learning_rate": 7.406799999999999e-06, + "loss": 0.0976, + "num_tokens": 68400873.0, + "reward": 2.848465919494629, + "reward_std": 0.06664532423019409, + "rewards/reward_fn/mean": 2.848465919494629, + "rewards/reward_fn/std": 0.0666453167796135, + "step": 1484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 922.0, + "completions/max_terminated_length": 922.0, + "completions/mean_length": 222.40625, + "completions/mean_terminated_length": 222.40625, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.15752625437572929, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.640625, + "kl": 0.02430442371405661, + "learning_rate": 7.406399999999999e-06, + "loss": -0.0518, + "num_tokens": 68437174.0, + "reward": 3.611813545227051, + "reward_std": 0.4808964431285858, + "rewards/reward_fn/mean": 3.611813545227051, + "rewards/reward_fn/std": 0.4808965027332306, + "step": 1485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1996.0, + "completions/mean_length": 783.9375, + "completions/mean_terminated_length": 653.1724243164062, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.1576323326615042, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.020748317008838058, + "learning_rate": 7.405999999999999e-06, + "loss": 0.353, + "num_tokens": 68507156.0, + "reward": 2.5030746459960938, + "reward_std": 0.7754137516021729, + "rewards/reward_fn/mean": 2.5030746459960938, + "rewards/reward_fn/std": 0.7754136919975281, + "step": 1486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 847.0, + "completions/mean_length": 430.71875, + "completions/mean_terminated_length": 378.5483703613281, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.15773841094727908, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.027423285646364093, + "learning_rate": 7.405599999999999e-06, + "loss": 0.213, + "num_tokens": 68565451.0, + "reward": 3.366755247116089, + "reward_std": 0.8312036991119385, + "rewards/reward_fn/mean": 3.366755247116089, + "rewards/reward_fn/std": 0.8312036991119385, + "step": 1487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1758.0, + "completions/max_terminated_length": 1758.0, + "completions/mean_length": 336.375, + "completions/mean_terminated_length": 336.375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.157844489233054, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.28125, + "kl": 0.03361711511388421, + "learning_rate": 7.4052e-06, + "loss": 0.1764, + "num_tokens": 68633303.0, + "reward": 3.351813316345215, + "reward_std": 0.7536391615867615, + "rewards/reward_fn/mean": 3.351813316345215, + "rewards/reward_fn/std": 0.7536391615867615, + "step": 1488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 859.0, + "completions/max_terminated_length": 859.0, + "completions/mean_length": 228.21875, + "completions/mean_terminated_length": 228.21875, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.15795056751882888, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.328125, + "kl": 0.032002222491428256, + "learning_rate": 7.4048e-06, + "loss": -0.0099, + "num_tokens": 68671102.0, + "reward": 3.9690937995910645, + "reward_std": 0.17483150959014893, + "rewards/reward_fn/mean": 3.9690937995910645, + "rewards/reward_fn/std": 0.17483149468898773, + "step": 1489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 117.0, + "completions/max_terminated_length": 117.0, + "completions/mean_length": 76.8125, + "completions/mean_terminated_length": 76.8125, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.1580566458046038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09033203125, + "kl": 0.014173948089592159, + "learning_rate": 7.4044e-06, + "loss": 0.0006, + "num_tokens": 68704120.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 653.0, + "completions/max_terminated_length": 653.0, + "completions/mean_length": 184.40625, + "completions/mean_terminated_length": 184.40625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.1581627240903787, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.027895437320694327, + "learning_rate": 7.404e-06, + "loss": -0.071, + "num_tokens": 68741221.0, + "reward": 3.5616817474365234, + "reward_std": 0.7033450603485107, + "rewards/reward_fn/mean": 3.5616817474365234, + "rewards/reward_fn/std": 0.7033450603485107, + "step": 1491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1376.0, + "completions/max_terminated_length": 1376.0, + "completions/mean_length": 323.90625, + "completions/mean_terminated_length": 323.90625, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.1582688023761536, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.029311693971976638, + "learning_rate": 7.4036e-06, + "loss": 0.118, + "num_tokens": 68787522.0, + "reward": 2.7153244018554688, + "reward_std": 0.4882570505142212, + "rewards/reward_fn/mean": 2.7153244018554688, + "rewards/reward_fn/std": 0.4882570505142212, + "step": 1492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 331.65625, + "completions/mean_terminated_length": 331.65625, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.1583748806619285, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.024782405234873295, + "learning_rate": 7.4032e-06, + "loss": -0.0318, + "num_tokens": 68835991.0, + "reward": 2.7767367362976074, + "reward_std": 0.3278542459011078, + "rewards/reward_fn/mean": 2.7767367362976074, + "rewards/reward_fn/std": 0.3278542757034302, + "step": 1493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1464.0, + "completions/max_terminated_length": 1464.0, + "completions/mean_length": 321.53125, + "completions/mean_terminated_length": 321.53125, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.1584809589477034, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.030189570039510727, + "learning_rate": 7.4028e-06, + "loss": 0.209, + "num_tokens": 68878216.0, + "reward": 2.947617292404175, + "reward_std": 0.026048338040709496, + "rewards/reward_fn/mean": 2.947617292404175, + "rewards/reward_fn/std": 0.026048310101032257, + "step": 1494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 236.3125, + "completions/mean_terminated_length": 236.3125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.1585870372334783, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.078125, + "kl": 0.023946196772158146, + "learning_rate": 7.4024e-06, + "loss": 0.001, + "num_tokens": 68921106.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 527.0, + "completions/max_terminated_length": 527.0, + "completions/mean_length": 162.21875, + "completions/mean_terminated_length": 162.21875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.15869311551925322, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1279296875, + "kl": 0.026347257429733872, + "learning_rate": 7.402e-06, + "loss": 0.0011, + "num_tokens": 68962841.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 242.6875, + "completions/mean_terminated_length": 242.6875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.1587991938050281, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.026496655773371458, + "learning_rate": 7.4015999999999996e-06, + "loss": -0.0276, + "num_tokens": 69005231.0, + "reward": 3.8105435371398926, + "reward_std": 0.4018127918243408, + "rewards/reward_fn/mean": 3.8105435371398926, + "rewards/reward_fn/std": 0.40181276202201843, + "step": 1497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 312.21875, + "completions/mean_terminated_length": 312.21875, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.15890527209080302, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.021861796732991934, + "learning_rate": 7.4011999999999995e-06, + "loss": 0.0307, + "num_tokens": 69063862.0, + "reward": 2.7844386100769043, + "reward_std": 1.1609641313552856, + "rewards/reward_fn/mean": 2.7844386100769043, + "rewards/reward_fn/std": 1.160964012145996, + "step": 1498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1797.0, + "completions/mean_length": 708.375, + "completions/mean_terminated_length": 619.0667114257812, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.1590113503765779, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.029261509189382195, + "learning_rate": 7.4007999999999995e-06, + "loss": 0.1809, + "num_tokens": 69123202.0, + "reward": 2.281765937805176, + "reward_std": 0.7985396385192871, + "rewards/reward_fn/mean": 2.281765937805176, + "rewards/reward_fn/std": 0.7985396385192871, + "step": 1499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1075.0, + "completions/max_terminated_length": 1075.0, + "completions/mean_length": 333.03125, + "completions/mean_terminated_length": 333.03125, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.15911742866235282, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9921875, + "kl": 0.0297744101844728, + "learning_rate": 7.4004e-06, + "loss": 0.0043, + "num_tokens": 69176387.0, + "reward": 2.7405807971954346, + "reward_std": 0.37365394830703735, + "rewards/reward_fn/mean": 2.7405807971954346, + "rewards/reward_fn/std": 0.37365394830703735, + "step": 1500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 968.0, + "completions/mean_length": 580.5625, + "completions/mean_terminated_length": 482.7333679199219, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.15922350694812773, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.02315366081893444, + "learning_rate": 7.4e-06, + "loss": 0.381, + "num_tokens": 69237973.0, + "reward": 2.747545003890991, + "reward_std": 0.7242361307144165, + "rewards/reward_fn/mean": 2.747545003890991, + "rewards/reward_fn/std": 0.7242361307144165, + "step": 1501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 977.0, + "completions/max_terminated_length": 977.0, + "completions/mean_length": 269.875, + "completions/mean_terminated_length": 269.875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.15932958523390262, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.208984375, + "kl": 0.024919069837778807, + "learning_rate": 7.3996e-06, + "loss": 0.001, + "num_tokens": 69265745.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 750.0, + "completions/max_terminated_length": 750.0, + "completions/mean_length": 360.5625, + "completions/mean_terminated_length": 360.5625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.15943566351967753, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.03635862981900573, + "learning_rate": 7.3992e-06, + "loss": -0.0237, + "num_tokens": 69309059.0, + "reward": 2.4815585613250732, + "reward_std": 0.4622640609741211, + "rewards/reward_fn/mean": 2.4815585613250732, + "rewards/reward_fn/std": 0.46226412057876587, + "step": 1503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 657.0, + "completions/max_terminated_length": 657.0, + "completions/mean_length": 277.4375, + "completions/mean_terminated_length": 277.4375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.15954174180545241, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.078125, + "kl": 0.022342822514474392, + "learning_rate": 7.398799999999999e-06, + "loss": 0.0009, + "num_tokens": 69362417.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.0, + "completions/max_terminated_length": 674.0, + "completions/mean_length": 290.0, + "completions/mean_terminated_length": 290.0, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.15964782009122733, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.025484284618869424, + "learning_rate": 7.398399999999999e-06, + "loss": 0.1144, + "num_tokens": 69405713.0, + "reward": 2.7833831310272217, + "reward_std": 0.04491547495126724, + "rewards/reward_fn/mean": 2.7833831310272217, + "rewards/reward_fn/std": 0.04491545632481575, + "step": 1505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 841.0, + "completions/max_terminated_length": 841.0, + "completions/mean_length": 210.4375, + "completions/mean_terminated_length": 210.4375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.15975389837700224, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.02738275215961039, + "learning_rate": 7.397999999999999e-06, + "loss": -0.0175, + "num_tokens": 69449247.0, + "reward": 3.937028646469116, + "reward_std": 0.24817818403244019, + "rewards/reward_fn/mean": 3.937028646469116, + "rewards/reward_fn/std": 0.24817822873592377, + "step": 1506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 653.0, + "completions/max_terminated_length": 653.0, + "completions/mean_length": 273.875, + "completions/mean_terminated_length": 273.875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.15985997666277713, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.080078125, + "kl": 0.02079133247025311, + "learning_rate": 7.397599999999999e-06, + "loss": 0.0008, + "num_tokens": 69498619.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 865.0, + "completions/max_terminated_length": 865.0, + "completions/mean_length": 205.09375, + "completions/mean_terminated_length": 205.09375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.15996605494855204, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.453125, + "kl": 0.031059396918863058, + "learning_rate": 7.397199999999999e-06, + "loss": 0.2326, + "num_tokens": 69547902.0, + "reward": 3.931945323944092, + "reward_std": 0.2682742774486542, + "rewards/reward_fn/mean": 3.931945323944092, + "rewards/reward_fn/std": 0.2682742774486542, + "step": 1508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 964.0, + "completions/max_terminated_length": 964.0, + "completions/mean_length": 348.375, + "completions/mean_terminated_length": 348.375, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.16007213323432692, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.025881059700623155, + "learning_rate": 7.396799999999999e-06, + "loss": -0.0134, + "num_tokens": 69599018.0, + "reward": 2.5926716327667236, + "reward_std": 0.1885869950056076, + "rewards/reward_fn/mean": 2.5926716327667236, + "rewards/reward_fn/std": 0.1885869950056076, + "step": 1509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 959.0, + "completions/max_terminated_length": 959.0, + "completions/mean_length": 242.125, + "completions/mean_terminated_length": 242.125, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.16017821152010184, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08447265625, + "kl": 0.020898002781905234, + "learning_rate": 7.396399999999999e-06, + "loss": 0.0008, + "num_tokens": 69646190.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 884.0, + "completions/max_terminated_length": 884.0, + "completions/mean_length": 298.71875, + "completions/mean_terminated_length": 298.71875, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.16028428980587675, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.02123245596885681, + "learning_rate": 7.395999999999999e-06, + "loss": -0.0262, + "num_tokens": 69696421.0, + "reward": 3.7587828636169434, + "reward_std": 0.6745774149894714, + "rewards/reward_fn/mean": 3.7587828636169434, + "rewards/reward_fn/std": 0.6745774745941162, + "step": 1511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 360.75, + "completions/mean_terminated_length": 360.75, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.16039036809165164, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.029440977377817035, + "learning_rate": 7.3956e-06, + "loss": 0.0316, + "num_tokens": 69741565.0, + "reward": 3.9266085624694824, + "reward_std": 0.41516539454460144, + "rewards/reward_fn/mean": 3.9266085624694824, + "rewards/reward_fn/std": 0.41516542434692383, + "step": 1512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 177.0625, + "completions/mean_terminated_length": 177.0625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.16049644637742655, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.296875, + "kl": 0.015543692628853023, + "learning_rate": 7.3952e-06, + "loss": -0.0021, + "num_tokens": 69788575.0, + "reward": 3.473254680633545, + "reward_std": 0.8447170853614807, + "rewards/reward_fn/mean": 3.473254680633545, + "rewards/reward_fn/std": 0.8447170853614807, + "step": 1513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 733.0, + "completions/max_terminated_length": 733.0, + "completions/mean_length": 410.625, + "completions/mean_terminated_length": 410.625, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.16060252466320143, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "kl": 0.0255625550635159, + "learning_rate": 7.3948e-06, + "loss": 0.1049, + "num_tokens": 69835827.0, + "reward": 2.6098246574401855, + "reward_std": 0.272195965051651, + "rewards/reward_fn/mean": 2.6098246574401855, + "rewards/reward_fn/std": 0.272195965051651, + "step": 1514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 859.0, + "completions/max_terminated_length": 859.0, + "completions/mean_length": 227.625, + "completions/mean_terminated_length": 227.625, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.16070860294897635, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732421875, + "kl": 0.0203583559487015, + "learning_rate": 7.3944e-06, + "loss": 0.0008, + "num_tokens": 69890279.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 176.90625, + "completions/mean_terminated_length": 176.90625, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.16081468123475123, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08984375, + "kl": 0.019284927984699607, + "learning_rate": 7.394e-06, + "loss": 0.0008, + "num_tokens": 69928644.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 166.0, + "completions/mean_terminated_length": 166.0, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.16092075952052615, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.05426881415769458, + "learning_rate": 7.3936e-06, + "loss": 0.0055, + "num_tokens": 69978820.0, + "reward": 3.9703755378723145, + "reward_std": 0.16758133471012115, + "rewards/reward_fn/mean": 3.9703755378723145, + "rewards/reward_fn/std": 0.16758134961128235, + "step": 1517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 807.0, + "completions/mean_length": 452.96875, + "completions/mean_terminated_length": 401.51611328125, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.16102683780630106, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.02300673839636147, + "learning_rate": 7.3932e-06, + "loss": 0.2649, + "num_tokens": 70017987.0, + "reward": 2.9328155517578125, + "reward_std": 0.6929539442062378, + "rewards/reward_fn/mean": 2.9328155517578125, + "rewards/reward_fn/std": 0.6929539442062378, + "step": 1518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 86.5625, + "completions/mean_terminated_length": 86.5625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.16113291609207595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.083984375, + "kl": 0.013300622056704015, + "learning_rate": 7.3928e-06, + "loss": 0.0005, + "num_tokens": 70041269.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 630.0, + "completions/max_terminated_length": 630.0, + "completions/mean_length": 251.375, + "completions/mean_terminated_length": 251.375, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.16123899437785086, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07958984375, + "kl": 0.020807479857467115, + "learning_rate": 7.3924e-06, + "loss": 0.0008, + "num_tokens": 70104513.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 144.375, + "completions/mean_terminated_length": 144.375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.16134507266362574, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "kl": 0.017212234903126955, + "learning_rate": 7.392e-06, + "loss": 0.1497, + "num_tokens": 70155629.0, + "reward": 2.998427629470825, + "reward_std": 0.039904408156871796, + "rewards/reward_fn/mean": 2.998427629470825, + "rewards/reward_fn/std": 0.039904408156871796, + "step": 1521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 884.0, + "completions/max_terminated_length": 884.0, + "completions/mean_length": 270.8125, + "completions/mean_terminated_length": 270.8125, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.16145115094940066, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1396484375, + "kl": 0.027058127569034696, + "learning_rate": 7.3916e-06, + "loss": 0.0011, + "num_tokens": 70211559.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1306.0, + "completions/max_terminated_length": 1306.0, + "completions/mean_length": 404.4375, + "completions/mean_terminated_length": 404.4375, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.16155722923517557, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.02585031627677381, + "learning_rate": 7.3912000000000005e-06, + "loss": -0.0859, + "num_tokens": 70285653.0, + "reward": 3.6108155250549316, + "reward_std": 0.5108808875083923, + "rewards/reward_fn/mean": 3.6108155250549316, + "rewards/reward_fn/std": 0.5108808279037476, + "step": 1523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 906.0, + "completions/max_terminated_length": 906.0, + "completions/mean_length": 265.375, + "completions/mean_terminated_length": 265.375, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.16166330752095046, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.021037982078269124, + "learning_rate": 7.3908e-06, + "loss": -0.0172, + "num_tokens": 70327169.0, + "reward": 3.0207977294921875, + "reward_std": 0.037842877209186554, + "rewards/reward_fn/mean": 3.0207977294921875, + "rewards/reward_fn/std": 0.03784283623099327, + "step": 1524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 102.78125, + "completions/mean_terminated_length": 102.78125, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.16176938580672537, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9375, + "kl": 0.02766521042212844, + "learning_rate": 7.3904e-06, + "loss": -0.0696, + "num_tokens": 70366170.0, + "reward": 3.8455190658569336, + "reward_std": 0.3647652268409729, + "rewards/reward_fn/mean": 3.8455190658569336, + "rewards/reward_fn/std": 0.3647651970386505, + "step": 1525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1333.0, + "completions/max_terminated_length": 1333.0, + "completions/mean_length": 192.96875, + "completions/mean_terminated_length": 192.96875, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.16187546409250025, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0859375, + "kl": 0.020648099714890122, + "learning_rate": 7.3899999999999995e-06, + "loss": 0.0008, + "num_tokens": 70402553.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 141.09375, + "completions/mean_terminated_length": 141.09375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.16198154237827517, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.140625, + "kl": 0.02335872733965516, + "learning_rate": 7.3895999999999995e-06, + "loss": 0.0009, + "num_tokens": 70447708.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 976.0, + "completions/max_terminated_length": 976.0, + "completions/mean_length": 201.6875, + "completions/mean_terminated_length": 201.6875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.16208762066405008, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08837890625, + "kl": 0.02256969455629587, + "learning_rate": 7.3891999999999995e-06, + "loss": 0.0009, + "num_tokens": 70483954.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1466.0, + "completions/max_terminated_length": 1466.0, + "completions/mean_length": 353.5625, + "completions/mean_terminated_length": 353.5625, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.16219369894982497, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.042810263112187386, + "learning_rate": 7.3887999999999995e-06, + "loss": 0.0261, + "num_tokens": 70534692.0, + "reward": 2.9112634658813477, + "reward_std": 0.054259590804576874, + "rewards/reward_fn/mean": 2.9112634658813477, + "rewards/reward_fn/std": 0.054259564727544785, + "step": 1529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 795.0, + "completions/max_terminated_length": 795.0, + "completions/mean_length": 203.75, + "completions/mean_terminated_length": 203.75, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.16229977723559988, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1474609375, + "kl": 0.027603084221482277, + "learning_rate": 7.3883999999999994e-06, + "loss": 0.0011, + "num_tokens": 70570908.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1341.0, + "completions/max_terminated_length": 1341.0, + "completions/mean_length": 443.78125, + "completions/mean_terminated_length": 443.78125, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.16240585552137476, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.0299779511988163, + "learning_rate": 7.387999999999999e-06, + "loss": 0.0348, + "num_tokens": 70638805.0, + "reward": 2.931946277618408, + "reward_std": 0.0676136463880539, + "rewards/reward_fn/mean": 2.931946277618408, + "rewards/reward_fn/std": 0.06761366128921509, + "step": 1531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1183.0, + "completions/max_terminated_length": 1183.0, + "completions/mean_length": 322.625, + "completions/mean_terminated_length": 322.625, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.16251193380714968, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.02347099781036377, + "learning_rate": 7.387599999999999e-06, + "loss": 0.0192, + "num_tokens": 70693225.0, + "reward": 3.963925838470459, + "reward_std": 0.2040664255619049, + "rewards/reward_fn/mean": 3.963925838470459, + "rewards/reward_fn/std": 0.2040664404630661, + "step": 1532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 685.0, + "completions/max_terminated_length": 685.0, + "completions/mean_length": 175.59375, + "completions/mean_terminated_length": 175.59375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.1626180120929246, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0625, + "kl": 0.028762807371094823, + "learning_rate": 7.387199999999999e-06, + "loss": 0.0637, + "num_tokens": 70734460.0, + "reward": 3.963350296020508, + "reward_std": 0.20732258260250092, + "rewards/reward_fn/mean": 3.963350296020508, + "rewards/reward_fn/std": 0.20732256770133972, + "step": 1533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.0, + "completions/max_terminated_length": 719.0, + "completions/mean_length": 266.03125, + "completions/mean_terminated_length": 266.03125, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.16272409037869948, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.076171875, + "kl": 0.018660985631868243, + "learning_rate": 7.386799999999999e-06, + "loss": 0.0007, + "num_tokens": 70788509.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 701.0, + "completions/max_terminated_length": 701.0, + "completions/mean_length": 164.75, + "completions/mean_terminated_length": 164.75, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.1628301686644744, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9375, + "kl": 0.027147594606503844, + "learning_rate": 7.3864e-06, + "loss": 0.1217, + "num_tokens": 70835989.0, + "reward": 3.89919376373291, + "reward_std": 0.3184683322906494, + "rewards/reward_fn/mean": 3.89919376373291, + "rewards/reward_fn/std": 0.318468302488327, + "step": 1535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 167.0, + "completions/max_terminated_length": 167.0, + "completions/mean_length": 92.625, + "completions/mean_terminated_length": 92.625, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.16293624695024927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.126953125, + "kl": 0.01960705651436001, + "learning_rate": 7.386e-06, + "loss": 0.0008, + "num_tokens": 70873065.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 145.71875, + "completions/mean_terminated_length": 145.71875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.1630423252360242, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09716796875, + "kl": 0.019639571546576917, + "learning_rate": 7.3856e-06, + "loss": 0.0008, + "num_tokens": 70920544.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 153.40625, + "completions/mean_terminated_length": 153.40625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.1631484035217991, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12060546875, + "kl": 0.021374219097197056, + "learning_rate": 7.3852e-06, + "loss": 0.0009, + "num_tokens": 70947181.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1104.0, + "completions/max_terminated_length": 1104.0, + "completions/mean_length": 327.65625, + "completions/mean_terminated_length": 327.65625, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.163254481807574, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.030522728571668267, + "learning_rate": 7.3848e-06, + "loss": 0.0377, + "num_tokens": 70987554.0, + "reward": 2.9481630325317383, + "reward_std": 0.22395570576190948, + "rewards/reward_fn/mean": 2.9481630325317383, + "rewards/reward_fn/std": 0.22395570576190948, + "step": 1539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 944.0, + "completions/max_terminated_length": 944.0, + "completions/mean_length": 466.15625, + "completions/mean_terminated_length": 466.15625, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.1633605600933489, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.029136600205674767, + "learning_rate": 7.3844e-06, + "loss": 0.047, + "num_tokens": 71050631.0, + "reward": 2.643458366394043, + "reward_std": 0.3626547157764435, + "rewards/reward_fn/mean": 2.643458366394043, + "rewards/reward_fn/std": 0.3626546859741211, + "step": 1540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 873.0, + "completions/max_terminated_length": 873.0, + "completions/mean_length": 280.8125, + "completions/mean_terminated_length": 280.8125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.16346663837912379, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.026954283006489277, + "learning_rate": 7.384e-06, + "loss": 0.0118, + "num_tokens": 71082049.0, + "reward": 3.7240138053894043, + "reward_std": 0.44852492213249207, + "rewards/reward_fn/mean": 3.7240138053894043, + "rewards/reward_fn/std": 0.4485248923301697, + "step": 1541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1465.0, + "completions/max_terminated_length": 1465.0, + "completions/mean_length": 354.84375, + "completions/mean_terminated_length": 354.84375, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.1635727166648987, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.53515625, + "kl": 0.025874186540022492, + "learning_rate": 7.3836e-06, + "loss": -0.1326, + "num_tokens": 71132028.0, + "reward": 2.750469207763672, + "reward_std": 0.20693431794643402, + "rewards/reward_fn/mean": 2.750469207763672, + "rewards/reward_fn/std": 0.20693430304527283, + "step": 1542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 248.59375, + "completions/mean_terminated_length": 248.59375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.16367879495067358, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.111328125, + "kl": 0.0319938138127327, + "learning_rate": 7.3832e-06, + "loss": 0.0013, + "num_tokens": 71177263.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1375.0, + "completions/mean_length": 798.59375, + "completions/mean_terminated_length": 758.290283203125, + "completions/min_length": 410.0, + "completions/min_terminated_length": 410.0, + "epoch": 0.1637848732364485, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.021971626905724406, + "learning_rate": 7.382799999999999e-06, + "loss": 0.1853, + "num_tokens": 71245058.0, + "reward": 2.5030364990234375, + "reward_std": 0.6058197021484375, + "rewards/reward_fn/mean": 2.5030364990234375, + "rewards/reward_fn/std": 0.6058197021484375, + "step": 1544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 588.0, + "completions/max_terminated_length": 588.0, + "completions/mean_length": 197.3125, + "completions/mean_terminated_length": 197.3125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.1638909515222234, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09765625, + "kl": 0.020507124136202037, + "learning_rate": 7.382399999999999e-06, + "loss": 0.0008, + "num_tokens": 71277932.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 262.3125, + "completions/mean_terminated_length": 262.3125, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.1639970298079983, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.027538377791643143, + "learning_rate": 7.381999999999999e-06, + "loss": 0.0391, + "num_tokens": 71344630.0, + "reward": 2.704584836959839, + "reward_std": 0.04232628643512726, + "rewards/reward_fn/mean": 2.704584836959839, + "rewards/reward_fn/std": 0.042326249182224274, + "step": 1546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1643.0, + "completions/mean_length": 497.6875, + "completions/mean_terminated_length": 447.6773986816406, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.1641031080937732, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.03232635045424104, + "learning_rate": 7.3816e-06, + "loss": 0.0815, + "num_tokens": 71395532.0, + "reward": 2.349479913711548, + "reward_std": 0.6918059587478638, + "rewards/reward_fn/mean": 2.349479913711548, + "rewards/reward_fn/std": 0.6918059587478638, + "step": 1547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 255.40625, + "completions/mean_terminated_length": 255.40625, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.1642091863795481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08154296875, + "kl": 0.018391662510111928, + "learning_rate": 7.3812e-06, + "loss": 0.0007, + "num_tokens": 71460441.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 921.0, + "completions/max_terminated_length": 921.0, + "completions/mean_length": 456.8125, + "completions/mean_terminated_length": 456.8125, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.164315264665323, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.02743516000919044, + "learning_rate": 7.3808e-06, + "loss": 0.0942, + "num_tokens": 71514451.0, + "reward": 3.201913356781006, + "reward_std": 0.6840846538543701, + "rewards/reward_fn/mean": 3.201913356781006, + "rewards/reward_fn/std": 0.6840846538543701, + "step": 1549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 828.0, + "completions/max_terminated_length": 828.0, + "completions/mean_length": 287.1875, + "completions/mean_terminated_length": 287.1875, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.16442134295109792, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1015625, + "kl": 0.02359214937314391, + "learning_rate": 7.3804e-06, + "loss": 0.0009, + "num_tokens": 71560633.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1822.0, + "completions/max_terminated_length": 1822.0, + "completions/mean_length": 379.65625, + "completions/mean_terminated_length": 379.65625, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.1645274212368728, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.027676088735461235, + "learning_rate": 7.38e-06, + "loss": 0.0507, + "num_tokens": 71621486.0, + "reward": 2.9698777198791504, + "reward_std": 0.056896451860666275, + "rewards/reward_fn/mean": 2.9698777198791504, + "rewards/reward_fn/std": 0.05689648166298866, + "step": 1551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 947.0, + "completions/max_terminated_length": 947.0, + "completions/mean_length": 294.40625, + "completions/mean_terminated_length": 294.40625, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.16463349952264772, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.022418924141675234, + "learning_rate": 7.3796e-06, + "loss": 0.0377, + "num_tokens": 71668123.0, + "reward": 2.8232269287109375, + "reward_std": 0.2750570774078369, + "rewards/reward_fn/mean": 2.8232269287109375, + "rewards/reward_fn/std": 0.2750571072101593, + "step": 1552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1089.0, + "completions/max_terminated_length": 1089.0, + "completions/mean_length": 321.78125, + "completions/mean_terminated_length": 321.78125, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.1647395778084226, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.030020848847925663, + "learning_rate": 7.3792e-06, + "loss": 0.073, + "num_tokens": 71715380.0, + "reward": 3.7270565032958984, + "reward_std": 0.5574728846549988, + "rewards/reward_fn/mean": 3.7270565032958984, + "rewards/reward_fn/std": 0.5574728846549988, + "step": 1553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1024.0, + "completions/mean_length": 282.1875, + "completions/mean_terminated_length": 282.1875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.16484565609419752, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.01917832251638174, + "learning_rate": 7.3787999999999996e-06, + "loss": -0.0677, + "num_tokens": 71762618.0, + "reward": 3.861374616622925, + "reward_std": 0.37316587567329407, + "rewards/reward_fn/mean": 3.861374616622925, + "rewards/reward_fn/std": 0.37316587567329407, + "step": 1554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 219.625, + "completions/mean_terminated_length": 219.625, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.16495173437997243, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.028847611974924803, + "learning_rate": 7.3783999999999995e-06, + "loss": -0.0166, + "num_tokens": 71807982.0, + "reward": 3.9281513690948486, + "reward_std": 0.40643757581710815, + "rewards/reward_fn/mean": 3.9281513690948486, + "rewards/reward_fn/std": 0.40643760561943054, + "step": 1555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1117.0, + "completions/max_terminated_length": 1117.0, + "completions/mean_length": 321.25, + "completions/mean_terminated_length": 321.25, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.16505781266574732, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.03293606429360807, + "learning_rate": 7.3779999999999995e-06, + "loss": 0.1044, + "num_tokens": 71862006.0, + "reward": 2.8002994060516357, + "reward_std": 0.053148169070482254, + "rewards/reward_fn/mean": 2.8002994060516357, + "rewards/reward_fn/std": 0.05314814671874046, + "step": 1556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1651.0, + "completions/max_terminated_length": 1651.0, + "completions/mean_length": 360.21875, + "completions/mean_terminated_length": 360.21875, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.16516389095152223, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.031179531943053007, + "learning_rate": 7.3775999999999995e-06, + "loss": 0.0442, + "num_tokens": 71911549.0, + "reward": 2.779324531555176, + "reward_std": 0.4085378348827362, + "rewards/reward_fn/mean": 2.779324531555176, + "rewards/reward_fn/std": 0.4085378348827362, + "step": 1557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 976.0, + "completions/mean_length": 500.25, + "completions/mean_terminated_length": 450.32257080078125, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.16526996923729712, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.027880383422598243, + "learning_rate": 7.3772e-06, + "loss": 0.2164, + "num_tokens": 71978277.0, + "reward": 2.6912436485290527, + "reward_std": 0.5629785060882568, + "rewards/reward_fn/mean": 2.6912436485290527, + "rewards/reward_fn/std": 0.5629785060882568, + "step": 1558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 856.0, + "completions/max_terminated_length": 856.0, + "completions/mean_length": 217.65625, + "completions/mean_terminated_length": 217.65625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.16537604752307203, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.375, + "kl": 0.025317950639873743, + "learning_rate": 7.3768e-06, + "loss": -0.0064, + "num_tokens": 72020922.0, + "reward": 3.0498218536376953, + "reward_std": 0.0369083546102047, + "rewards/reward_fn/mean": 3.0498218536376953, + "rewards/reward_fn/std": 0.0369083397090435, + "step": 1559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1086.0, + "completions/max_terminated_length": 1086.0, + "completions/mean_length": 262.90625, + "completions/mean_terminated_length": 262.90625, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.16548212580884694, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.02574088191613555, + "learning_rate": 7.3764e-06, + "loss": -0.0193, + "num_tokens": 72049975.0, + "reward": 3.622433662414551, + "reward_std": 0.8289299607276917, + "rewards/reward_fn/mean": 3.622433662414551, + "rewards/reward_fn/std": 0.8289299607276917, + "step": 1560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 152.625, + "completions/mean_terminated_length": 152.625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.16558820409462183, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12353515625, + "kl": 0.021092961658723652, + "learning_rate": 7.376e-06, + "loss": 0.0008, + "num_tokens": 72086347.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.0, + "completions/max_terminated_length": 539.0, + "completions/mean_length": 124.40625, + "completions/mean_terminated_length": 124.40625, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.16569428238039674, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.71875, + "kl": 0.029914353508502245, + "learning_rate": 7.3756e-06, + "loss": 0.1327, + "num_tokens": 72125208.0, + "reward": 2.8506991863250732, + "reward_std": 0.03070419654250145, + "rewards/reward_fn/mean": 2.8506991863250732, + "rewards/reward_fn/std": 0.030704230070114136, + "step": 1562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.16580036066617163, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.03901820583269, + "learning_rate": 7.3752e-06, + "loss": 0.0406, + "num_tokens": 72173097.0, + "reward": 3.0672106742858887, + "reward_std": 0.3122633397579193, + "rewards/reward_fn/mean": 3.0672106742858887, + "rewards/reward_fn/std": 0.3122633397579193, + "step": 1563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.0, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 160.21875, + "completions/mean_terminated_length": 160.21875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.16590643895194654, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.390625, + "kl": 0.029396470403298736, + "learning_rate": 7.374799999999999e-06, + "loss": -0.0484, + "num_tokens": 72198128.0, + "reward": 3.639256477355957, + "reward_std": 0.5450024604797363, + "rewards/reward_fn/mean": 3.639256477355957, + "rewards/reward_fn/std": 0.5450024604797363, + "step": 1564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 616.0, + "completions/max_terminated_length": 616.0, + "completions/mean_length": 185.875, + "completions/mean_terminated_length": 185.875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.16601251723772145, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.05406060954555869, + "learning_rate": 7.374399999999999e-06, + "loss": 0.0989, + "num_tokens": 72225964.0, + "reward": 3.8368563652038574, + "reward_std": 0.3858475089073181, + "rewards/reward_fn/mean": 3.8368563652038574, + "rewards/reward_fn/std": 0.3858474791049957, + "step": 1565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1062.0, + "completions/max_terminated_length": 1062.0, + "completions/mean_length": 372.65625, + "completions/mean_terminated_length": 372.65625, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.16611859552349634, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.022666738834232092, + "learning_rate": 7.373999999999999e-06, + "loss": -0.0341, + "num_tokens": 72285057.0, + "reward": 3.9279088973999023, + "reward_std": 0.40780818462371826, + "rewards/reward_fn/mean": 3.9279088973999023, + "rewards/reward_fn/std": 0.40780818462371826, + "step": 1566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1170.0, + "completions/mean_length": 389.0, + "completions/mean_terminated_length": 335.4838562011719, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.16622467380927125, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.02747956058010459, + "learning_rate": 7.373599999999999e-06, + "loss": 0.154, + "num_tokens": 72340545.0, + "reward": 2.9696366786956787, + "reward_std": 0.44867783784866333, + "rewards/reward_fn/mean": 2.9696366786956787, + "rewards/reward_fn/std": 0.44867780804634094, + "step": 1567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 804.0, + "completions/max_terminated_length": 804.0, + "completions/mean_length": 209.875, + "completions/mean_terminated_length": 209.875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.16633075209504614, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1142578125, + "kl": 0.0265498380176723, + "learning_rate": 7.373199999999999e-06, + "loss": 0.0011, + "num_tokens": 72398941.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 176.28125, + "completions/mean_terminated_length": 176.28125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.16643683038082105, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1396484375, + "kl": 0.027999462094157934, + "learning_rate": 7.372799999999999e-06, + "loss": 0.0011, + "num_tokens": 72469094.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.0, + "completions/max_terminated_length": 580.0, + "completions/mean_length": 188.53125, + "completions/mean_terminated_length": 188.53125, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.16654290866659593, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.028627387015148997, + "learning_rate": 7.3724e-06, + "loss": -0.1119, + "num_tokens": 72509143.0, + "reward": 2.8793563842773438, + "reward_std": 0.6584159135818481, + "rewards/reward_fn/mean": 2.8793563842773438, + "rewards/reward_fn/std": 0.6584158539772034, + "step": 1570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1219.0, + "completions/max_terminated_length": 1219.0, + "completions/mean_length": 220.625, + "completions/mean_terminated_length": 220.625, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.16664898695237085, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.02936831465922296, + "learning_rate": 7.372e-06, + "loss": 0.098, + "num_tokens": 72537067.0, + "reward": 2.8143582344055176, + "reward_std": 0.03991476818919182, + "rewards/reward_fn/mean": 2.8143582344055176, + "rewards/reward_fn/std": 0.03991476073861122, + "step": 1571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1105.0, + "completions/max_terminated_length": 1105.0, + "completions/mean_length": 256.75, + "completions/mean_terminated_length": 256.75, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.16675506523814576, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1005859375, + "kl": 0.027146983658894897, + "learning_rate": 7.3716e-06, + "loss": 0.0011, + "num_tokens": 72564003.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 150.625, + "completions/mean_terminated_length": 150.625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.16686114352392065, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1318359375, + "kl": 0.023954558419063687, + "learning_rate": 7.3712e-06, + "loss": 0.001, + "num_tokens": 72630999.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 285.28125, + "completions/mean_terminated_length": 285.28125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.16696722180969556, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.027458957163617015, + "learning_rate": 7.3708e-06, + "loss": 0.0287, + "num_tokens": 72683744.0, + "reward": 2.6126034259796143, + "reward_std": 0.7907987236976624, + "rewards/reward_fn/mean": 2.6126034259796143, + "rewards/reward_fn/std": 0.7907987236976624, + "step": 1574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1310.0, + "completions/max_terminated_length": 1310.0, + "completions/mean_length": 438.21875, + "completions/mean_terminated_length": 438.21875, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.16707330009547045, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.026026516687124968, + "learning_rate": 7.3704e-06, + "loss": 0.114, + "num_tokens": 72730439.0, + "reward": 3.1812829971313477, + "reward_std": 0.5636266469955444, + "rewards/reward_fn/mean": 3.1812829971313477, + "rewards/reward_fn/std": 0.5636265873908997, + "step": 1575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 846.0, + "completions/max_terminated_length": 846.0, + "completions/mean_length": 194.09375, + "completions/mean_terminated_length": 194.09375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.16717937838124536, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.28125, + "kl": 0.025642277905717492, + "learning_rate": 7.37e-06, + "loss": 0.0384, + "num_tokens": 72779370.0, + "reward": 3.8860883712768555, + "reward_std": 0.468019962310791, + "rewards/reward_fn/mean": 3.8860883712768555, + "rewards/reward_fn/std": 0.468019962310791, + "step": 1576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 811.0, + "completions/max_terminated_length": 811.0, + "completions/mean_length": 213.1875, + "completions/mean_terminated_length": 213.1875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.16728545666702027, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5, + "kl": 0.03445283626206219, + "learning_rate": 7.3696e-06, + "loss": -0.2059, + "num_tokens": 72815856.0, + "reward": 3.205237865447998, + "reward_std": 0.4682815372943878, + "rewards/reward_fn/mean": 3.205237865447998, + "rewards/reward_fn/std": 0.46828150749206543, + "step": 1577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1284.0, + "completions/max_terminated_length": 1284.0, + "completions/mean_length": 308.40625, + "completions/mean_terminated_length": 308.40625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.16739153495279516, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.095703125, + "kl": 0.024654962122440338, + "learning_rate": 7.3692e-06, + "loss": 0.001, + "num_tokens": 72863901.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 313.21875, + "completions/mean_terminated_length": 313.21875, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.16749761323857007, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.02547546150162816, + "learning_rate": 7.3688e-06, + "loss": 0.0737, + "num_tokens": 72922372.0, + "reward": 3.0208816528320312, + "reward_std": 0.756123423576355, + "rewards/reward_fn/mean": 3.0208816528320312, + "rewards/reward_fn/std": 0.7561233639717102, + "step": 1579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 192.5625, + "completions/mean_terminated_length": 192.5625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.16760369152434496, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.04026231449097395, + "learning_rate": 7.3684e-06, + "loss": -0.0245, + "num_tokens": 72963222.0, + "reward": 3.9631948471069336, + "reward_std": 0.2082015424966812, + "rewards/reward_fn/mean": 3.9631948471069336, + "rewards/reward_fn/std": 0.2082015424966812, + "step": 1580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1534.0, + "completions/max_terminated_length": 1534.0, + "completions/mean_length": 387.25, + "completions/mean_terminated_length": 387.25, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.16770976981011987, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.029226713813841343, + "learning_rate": 7.368e-06, + "loss": 0.0179, + "num_tokens": 73006558.0, + "reward": 2.7275261878967285, + "reward_std": 0.326748251914978, + "rewards/reward_fn/mean": 2.7275261878967285, + "rewards/reward_fn/std": 0.32674822211265564, + "step": 1581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 127.5, + "completions/mean_terminated_length": 127.5, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.16781584809589478, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1591796875, + "kl": 0.03317599557340145, + "learning_rate": 7.3676e-06, + "loss": 0.0013, + "num_tokens": 73053486.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 951.0, + "completions/max_terminated_length": 951.0, + "completions/mean_length": 281.9375, + "completions/mean_terminated_length": 281.9375, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.16792192638166967, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09716796875, + "kl": 0.022451504366472363, + "learning_rate": 7.3672e-06, + "loss": 0.0009, + "num_tokens": 73086188.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 163.0625, + "completions/mean_terminated_length": 163.0625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.16802800466744458, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.390625, + "kl": 0.026922843884676695, + "learning_rate": 7.3667999999999995e-06, + "loss": 0.0565, + "num_tokens": 73119246.0, + "reward": 2.860722064971924, + "reward_std": 0.0443354956805706, + "rewards/reward_fn/mean": 2.860722064971924, + "rewards/reward_fn/std": 0.04433548450469971, + "step": 1584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1794.0, + "completions/max_terminated_length": 1794.0, + "completions/mean_length": 516.78125, + "completions/mean_terminated_length": 516.78125, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.16813408295321947, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.027608325239270926, + "learning_rate": 7.3663999999999995e-06, + "loss": 0.0582, + "num_tokens": 73182151.0, + "reward": 3.1305994987487793, + "reward_std": 0.9841459393501282, + "rewards/reward_fn/mean": 3.1305994987487793, + "rewards/reward_fn/std": 0.9841459393501282, + "step": 1585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 754.0, + "completions/max_terminated_length": 754.0, + "completions/mean_length": 214.09375, + "completions/mean_terminated_length": 214.09375, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.16824016123899438, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.02348411502316594, + "learning_rate": 7.3659999999999994e-06, + "loss": 0.0196, + "num_tokens": 73222634.0, + "reward": 3.9632420539855957, + "reward_std": 0.2079339474439621, + "rewards/reward_fn/mean": 3.9632420539855957, + "rewards/reward_fn/std": 0.2079339176416397, + "step": 1586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 899.0, + "completions/max_terminated_length": 899.0, + "completions/mean_length": 314.28125, + "completions/mean_terminated_length": 314.28125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.1683462395247693, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.0289424117654562, + "learning_rate": 7.365599999999999e-06, + "loss": 0.1943, + "num_tokens": 73276947.0, + "reward": 3.456662178039551, + "reward_std": 0.8585535883903503, + "rewards/reward_fn/mean": 3.456662178039551, + "rewards/reward_fn/std": 0.8585535883903503, + "step": 1587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 123.40625, + "completions/mean_terminated_length": 123.40625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.16845231781054418, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1376953125, + "kl": 0.029007577802985907, + "learning_rate": 7.365199999999999e-06, + "loss": 0.0012, + "num_tokens": 73312480.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1397.0, + "completions/max_terminated_length": 1397.0, + "completions/mean_length": 364.125, + "completions/mean_terminated_length": 364.125, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.1685583960963191, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.0202113245613873, + "learning_rate": 7.364799999999999e-06, + "loss": -0.0049, + "num_tokens": 73358404.0, + "reward": 3.0420591831207275, + "reward_std": 0.37027791142463684, + "rewards/reward_fn/mean": 3.0420591831207275, + "rewards/reward_fn/std": 0.37027788162231445, + "step": 1589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 754.0, + "completions/mean_length": 467.5, + "completions/mean_terminated_length": 416.51611328125, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.16866447438209398, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.019788505509495735, + "learning_rate": 7.364399999999999e-06, + "loss": 0.2435, + "num_tokens": 73411316.0, + "reward": 2.620427370071411, + "reward_std": 0.519374668598175, + "rewards/reward_fn/mean": 2.620427370071411, + "rewards/reward_fn/std": 0.519374668598175, + "step": 1590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 893.0, + "completions/max_terminated_length": 893.0, + "completions/mean_length": 339.25, + "completions/mean_terminated_length": 339.25, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.1687705526678689, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1494140625, + "kl": 0.03387824585661292, + "learning_rate": 7.363999999999999e-06, + "loss": 0.0014, + "num_tokens": 73453916.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 276.1875, + "completions/mean_terminated_length": 276.1875, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.1688766309536438, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09765625, + "kl": 0.026950898114591837, + "learning_rate": 7.363599999999999e-06, + "loss": 0.0011, + "num_tokens": 73484098.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1189.0, + "completions/max_terminated_length": 1189.0, + "completions/mean_length": 293.84375, + "completions/mean_terminated_length": 293.84375, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.1689827092394187, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.0297443438321352, + "learning_rate": 7.363199999999999e-06, + "loss": 0.0417, + "num_tokens": 73524253.0, + "reward": 3.6159801483154297, + "reward_std": 0.506695032119751, + "rewards/reward_fn/mean": 3.6159801483154297, + "rewards/reward_fn/std": 0.506695032119751, + "step": 1593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 813.0, + "completions/max_terminated_length": 813.0, + "completions/mean_length": 434.375, + "completions/mean_terminated_length": 434.375, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.1690887875251936, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.022996835177764297, + "learning_rate": 7.3628e-06, + "loss": -0.0086, + "num_tokens": 73581993.0, + "reward": 3.1963114738464355, + "reward_std": 0.5918622612953186, + "rewards/reward_fn/mean": 3.1963114738464355, + "rewards/reward_fn/std": 0.5918623208999634, + "step": 1594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 202.3125, + "completions/mean_terminated_length": 202.3125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.1691948658109685, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "kl": 0.02059872355312109, + "learning_rate": 7.3624e-06, + "loss": 0.0376, + "num_tokens": 73628723.0, + "reward": 3.9325742721557617, + "reward_std": 0.3814173638820648, + "rewards/reward_fn/mean": 3.9325742721557617, + "rewards/reward_fn/std": 0.38141733407974243, + "step": 1595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 228.5625, + "completions/mean_terminated_length": 228.5625, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.1693009440967434, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.02786667738109827, + "learning_rate": 7.362e-06, + "loss": 0.04, + "num_tokens": 73666501.0, + "reward": 3.7850565910339355, + "reward_std": 0.5395143628120422, + "rewards/reward_fn/mean": 3.7850565910339355, + "rewards/reward_fn/std": 0.5395143628120422, + "step": 1596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 993.0, + "completions/max_terminated_length": 993.0, + "completions/mean_length": 337.75, + "completions/mean_terminated_length": 337.75, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.16940702238251829, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0, + "kl": 0.02589184185490012, + "learning_rate": 7.3616e-06, + "loss": -0.0659, + "num_tokens": 73713885.0, + "reward": 2.7677502632141113, + "reward_std": 0.19476144015789032, + "rewards/reward_fn/mean": 2.7677502632141113, + "rewards/reward_fn/std": 0.1947614550590515, + "step": 1597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1111.0, + "completions/max_terminated_length": 1111.0, + "completions/mean_length": 375.09375, + "completions/mean_terminated_length": 375.09375, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.1695131006682932, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.01787739770952612, + "learning_rate": 7.3612e-06, + "loss": -0.0086, + "num_tokens": 73771200.0, + "reward": 3.8554279804229736, + "reward_std": 0.5688852071762085, + "rewards/reward_fn/mean": 3.8554279804229736, + "rewards/reward_fn/std": 0.5688852071762085, + "step": 1598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 238.1875, + "completions/mean_terminated_length": 238.1875, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.1696191789540681, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.024122617673128843, + "learning_rate": 7.3608e-06, + "loss": -0.0122, + "num_tokens": 73835718.0, + "reward": 2.7904884815216064, + "reward_std": 0.028905630111694336, + "rewards/reward_fn/mean": 2.7904884815216064, + "rewards/reward_fn/std": 0.02890562266111374, + "step": 1599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 784.0, + "completions/max_terminated_length": 784.0, + "completions/mean_length": 207.09375, + "completions/mean_terminated_length": 207.09375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.169725257239843, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.03663892112672329, + "learning_rate": 7.3604e-06, + "loss": 0.0213, + "num_tokens": 73885321.0, + "reward": 2.8401732444763184, + "reward_std": 0.19660231471061707, + "rewards/reward_fn/mean": 2.8401732444763184, + "rewards/reward_fn/std": 0.19660234451293945, + "step": 1600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 664.0, + "completions/mean_length": 403.5, + "completions/mean_terminated_length": 350.45159912109375, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.1698313355256179, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.02747776103205979, + "learning_rate": 7.36e-06, + "loss": 0.256, + "num_tokens": 73948569.0, + "reward": 2.654446601867676, + "reward_std": 0.600631058216095, + "rewards/reward_fn/mean": 2.654446601867676, + "rewards/reward_fn/std": 0.600631058216095, + "step": 1601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1447.0, + "completions/max_terminated_length": 1447.0, + "completions/mean_length": 322.1875, + "completions/mean_terminated_length": 322.1875, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.1699374138113928, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.030739419627934694, + "learning_rate": 7.3596e-06, + "loss": 0.0544, + "num_tokens": 73980255.0, + "reward": 3.819499969482422, + "reward_std": 0.5079775452613831, + "rewards/reward_fn/mean": 3.819499969482422, + "rewards/reward_fn/std": 0.5079775452613831, + "step": 1602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 140.1875, + "completions/mean_terminated_length": 140.1875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.1700434920971677, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1416015625, + "kl": 0.026381214149296284, + "learning_rate": 7.3592e-06, + "loss": 0.0011, + "num_tokens": 74014885.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1114.0, + "completions/max_terminated_length": 1114.0, + "completions/mean_length": 496.78125, + "completions/mean_terminated_length": 496.78125, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.17014957038294262, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.023547572316601872, + "learning_rate": 7.358799999999999e-06, + "loss": 0.017, + "num_tokens": 74071262.0, + "reward": 3.6152219772338867, + "reward_std": 0.6692531108856201, + "rewards/reward_fn/mean": 3.6152219772338867, + "rewards/reward_fn/std": 0.6692530512809753, + "step": 1604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1870.0, + "completions/max_terminated_length": 1870.0, + "completions/mean_length": 533.1875, + "completions/mean_terminated_length": 533.1875, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.1702556486687175, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.02932202792726457, + "learning_rate": 7.358399999999999e-06, + "loss": -0.0112, + "num_tokens": 74124580.0, + "reward": 2.6716699600219727, + "reward_std": 0.3422979414463043, + "rewards/reward_fn/mean": 2.6716699600219727, + "rewards/reward_fn/std": 0.34229791164398193, + "step": 1605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 828.0, + "completions/max_terminated_length": 828.0, + "completions/mean_length": 226.53125, + "completions/mean_terminated_length": 226.53125, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.17036172695449242, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.02531708194874227, + "learning_rate": 7.358e-06, + "loss": 0.0259, + "num_tokens": 74148757.0, + "reward": 2.9681363105773926, + "reward_std": 0.04596217721700668, + "rewards/reward_fn/mean": 2.9681363105773926, + "rewards/reward_fn/std": 0.04596218094229698, + "step": 1606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2008.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 379.90625, + "completions/mean_terminated_length": 379.90625, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.1704678052402673, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.0387666889000684, + "learning_rate": 7.3576e-06, + "loss": -0.0632, + "num_tokens": 74189490.0, + "reward": 2.6473255157470703, + "reward_std": 0.4680839478969574, + "rewards/reward_fn/mean": 2.6473255157470703, + "rewards/reward_fn/std": 0.468083918094635, + "step": 1607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 151.875, + "completions/mean_terminated_length": 151.875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.17057388352604222, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.46875, + "kl": 0.02067621098831296, + "learning_rate": 7.3572e-06, + "loss": 0.0394, + "num_tokens": 74237422.0, + "reward": 3.9700491428375244, + "reward_std": 0.16942748427391052, + "rewards/reward_fn/mean": 3.9700491428375244, + "rewards/reward_fn/std": 0.16942746937274933, + "step": 1608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 917.0, + "completions/max_terminated_length": 917.0, + "completions/mean_length": 267.5625, + "completions/mean_terminated_length": 267.5625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.17067996181181713, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.33984375, + "kl": 0.03733672644011676, + "learning_rate": 7.3568e-06, + "loss": 0.0015, + "num_tokens": 74283328.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 717.0, + "completions/max_terminated_length": 717.0, + "completions/mean_length": 193.03125, + "completions/mean_terminated_length": 193.03125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.17078604009759202, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11865234375, + "kl": 0.028381220996379852, + "learning_rate": 7.3563999999999996e-06, + "loss": 0.0011, + "num_tokens": 74317025.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.0, + "completions/max_terminated_length": 594.0, + "completions/mean_length": 217.75, + "completions/mean_terminated_length": 217.75, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.17089211838336693, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0830078125, + "kl": 0.01850617292802781, + "learning_rate": 7.3559999999999995e-06, + "loss": 0.0007, + "num_tokens": 74370809.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 762.0, + "completions/max_terminated_length": 762.0, + "completions/mean_length": 328.8125, + "completions/mean_terminated_length": 328.8125, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.17099819666914182, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.025823280215263367, + "learning_rate": 7.3555999999999995e-06, + "loss": 0.087, + "num_tokens": 74432435.0, + "reward": 3.2880189418792725, + "reward_std": 0.9963902831077576, + "rewards/reward_fn/mean": 3.2880189418792725, + "rewards/reward_fn/std": 0.996390163898468, + "step": 1612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 190.53125, + "completions/mean_terminated_length": 190.53125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.17110427495491673, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12890625, + "kl": 0.03128618444316089, + "learning_rate": 7.3551999999999995e-06, + "loss": 0.0013, + "num_tokens": 74462436.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1551.0, + "completions/max_terminated_length": 1551.0, + "completions/mean_length": 391.59375, + "completions/mean_terminated_length": 391.59375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.17121035324069164, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.0252360668964684, + "learning_rate": 7.3547999999999994e-06, + "loss": 0.0505, + "num_tokens": 74505847.0, + "reward": 2.924363136291504, + "reward_std": 0.3568187355995178, + "rewards/reward_fn/mean": 2.924363136291504, + "rewards/reward_fn/std": 0.3568187355995178, + "step": 1614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1390.0, + "completions/max_terminated_length": 1390.0, + "completions/mean_length": 415.875, + "completions/mean_terminated_length": 415.875, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.17131643152646653, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.024169984506443143, + "learning_rate": 7.354399999999999e-06, + "loss": -0.0145, + "num_tokens": 74576595.0, + "reward": 2.773521900177002, + "reward_std": 0.06824694573879242, + "rewards/reward_fn/mean": 2.773521900177002, + "rewards/reward_fn/std": 0.06824696063995361, + "step": 1615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1172.0, + "completions/max_terminated_length": 1172.0, + "completions/mean_length": 470.65625, + "completions/mean_terminated_length": 470.65625, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.17142250981224144, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.01885635987855494, + "learning_rate": 7.353999999999999e-06, + "loss": 0.1074, + "num_tokens": 74630760.0, + "reward": 3.872809886932373, + "reward_std": 0.4551088809967041, + "rewards/reward_fn/mean": 3.872809886932373, + "rewards/reward_fn/std": 0.4551088511943817, + "step": 1616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1438.0, + "completions/max_terminated_length": 1438.0, + "completions/mean_length": 561.8125, + "completions/mean_terminated_length": 561.8125, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.17152858809801633, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.02392402058467269, + "learning_rate": 7.3536e-06, + "loss": 0.0666, + "num_tokens": 74692962.0, + "reward": 2.882366895675659, + "reward_std": 0.08499280363321304, + "rewards/reward_fn/mean": 2.882366895675659, + "rewards/reward_fn/std": 0.08499278873205185, + "step": 1617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 610.0, + "completions/max_terminated_length": 610.0, + "completions/mean_length": 185.90625, + "completions/mean_terminated_length": 185.90625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.17163466638379124, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.027309770928695798, + "learning_rate": 7.3532e-06, + "loss": 0.0287, + "num_tokens": 74744191.0, + "reward": 3.217087507247925, + "reward_std": 0.46121397614479065, + "rewards/reward_fn/mean": 3.217087507247925, + "rewards/reward_fn/std": 0.46121400594711304, + "step": 1618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 593.0, + "completions/max_terminated_length": 593.0, + "completions/mean_length": 328.09375, + "completions/mean_terminated_length": 328.09375, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.17174074466956615, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.02444523060694337, + "learning_rate": 7.3528e-06, + "loss": 0.0277, + "num_tokens": 74776962.0, + "reward": 3.580929756164551, + "reward_std": 0.706642210483551, + "rewards/reward_fn/mean": 3.580929756164551, + "rewards/reward_fn/std": 0.7066422700881958, + "step": 1619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 232.71875, + "completions/mean_terminated_length": 232.71875, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.17184682295534104, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.02058199269231409, + "learning_rate": 7.3524e-06, + "loss": 0.0334, + "num_tokens": 74814873.0, + "reward": 2.8619747161865234, + "reward_std": 0.30958980321884155, + "rewards/reward_fn/mean": 2.8619747161865234, + "rewards/reward_fn/std": 0.30958983302116394, + "step": 1620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1276.0, + "completions/max_terminated_length": 1276.0, + "completions/mean_length": 184.8125, + "completions/mean_terminated_length": 184.8125, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.17195290124111595, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.640625, + "kl": 0.03645259817130864, + "learning_rate": 7.352e-06, + "loss": -0.0075, + "num_tokens": 74851827.0, + "reward": 3.96586012840271, + "reward_std": 0.19312410056591034, + "rewards/reward_fn/mean": 3.96586012840271, + "rewards/reward_fn/std": 0.19312405586242676, + "step": 1621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 295.625, + "completions/mean_terminated_length": 295.625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.17205897952689084, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.03457627212628722, + "learning_rate": 7.3516e-06, + "loss": 0.03, + "num_tokens": 74890727.0, + "reward": 2.7471697330474854, + "reward_std": 0.29337078332901, + "rewards/reward_fn/mean": 2.7471697330474854, + "rewards/reward_fn/std": 0.2933708131313324, + "step": 1622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1086.0, + "completions/max_terminated_length": 1086.0, + "completions/mean_length": 182.4375, + "completions/mean_terminated_length": 182.4375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.17216505781266575, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.59375, + "kl": 0.025542156072333455, + "learning_rate": 7.3512e-06, + "loss": 0.1191, + "num_tokens": 74926453.0, + "reward": 3.7256574630737305, + "reward_std": 0.7376229763031006, + "rewards/reward_fn/mean": 3.7256574630737305, + "rewards/reward_fn/std": 0.7376229763031006, + "step": 1623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1104.0, + "completions/max_terminated_length": 1104.0, + "completions/mean_length": 266.59375, + "completions/mean_terminated_length": 266.59375, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.17227113609844064, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.01520739821717143, + "learning_rate": 7.350799999999999e-06, + "loss": 0.0953, + "num_tokens": 74982056.0, + "reward": 2.95540714263916, + "reward_std": 0.2933211028575897, + "rewards/reward_fn/mean": 2.95540714263916, + "rewards/reward_fn/std": 0.29332107305526733, + "step": 1624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 254.5, + "completions/mean_terminated_length": 254.5, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.17237721438421555, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.030610437272116542, + "learning_rate": 7.350399999999999e-06, + "loss": -0.0358, + "num_tokens": 75024792.0, + "reward": 1.808734655380249, + "reward_std": 0.40144309401512146, + "rewards/reward_fn/mean": 1.808734655380249, + "rewards/reward_fn/std": 0.4014430642127991, + "step": 1625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 657.0, + "completions/max_terminated_length": 657.0, + "completions/mean_length": 326.96875, + "completions/mean_terminated_length": 326.96875, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.17248329266999046, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.02779210708104074, + "learning_rate": 7.349999999999999e-06, + "loss": 0.1173, + "num_tokens": 75070423.0, + "reward": 3.8114876747131348, + "reward_std": 0.44545185565948486, + "rewards/reward_fn/mean": 3.8114876747131348, + "rewards/reward_fn/std": 0.4454518258571625, + "step": 1626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1139.0, + "completions/max_terminated_length": 1139.0, + "completions/mean_length": 374.28125, + "completions/mean_terminated_length": 374.28125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.17258937095576535, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.023398922523483634, + "learning_rate": 7.349599999999999e-06, + "loss": 0.0652, + "num_tokens": 75116768.0, + "reward": 2.8657479286193848, + "reward_std": 0.4419115483760834, + "rewards/reward_fn/mean": 2.8657479286193848, + "rewards/reward_fn/std": 0.441911518573761, + "step": 1627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1205.0, + "completions/max_terminated_length": 1205.0, + "completions/mean_length": 396.71875, + "completions/mean_terminated_length": 396.71875, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.17269544924154026, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.01942450739443302, + "learning_rate": 7.349199999999999e-06, + "loss": 0.0472, + "num_tokens": 75178231.0, + "reward": 3.8060476779937744, + "reward_std": 0.5507187247276306, + "rewards/reward_fn/mean": 3.8060476779937744, + "rewards/reward_fn/std": 0.5507186651229858, + "step": 1628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 621.0, + "completions/max_terminated_length": 621.0, + "completions/mean_length": 199.40625, + "completions/mean_terminated_length": 199.40625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.17280152752731515, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.78125, + "kl": 0.02440522238612175, + "learning_rate": 7.3488e-06, + "loss": 0.1618, + "num_tokens": 75226660.0, + "reward": 3.931492328643799, + "reward_std": 0.22422750294208527, + "rewards/reward_fn/mean": 3.931492328643799, + "rewards/reward_fn/std": 0.22422751784324646, + "step": 1629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1092.0, + "completions/max_terminated_length": 1092.0, + "completions/mean_length": 252.53125, + "completions/mean_terminated_length": 252.53125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.17290760581309006, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.02909305295906961, + "learning_rate": 7.3484e-06, + "loss": -0.2047, + "num_tokens": 75274133.0, + "reward": 3.0297999382019043, + "reward_std": 0.32074329257011414, + "rewards/reward_fn/mean": 3.0297999382019043, + "rewards/reward_fn/std": 0.3207433819770813, + "step": 1630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1720.0, + "completions/max_terminated_length": 1720.0, + "completions/mean_length": 567.96875, + "completions/mean_terminated_length": 567.96875, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "epoch": 0.17301368409886497, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.022551232716068625, + "learning_rate": 7.348e-06, + "loss": 0.118, + "num_tokens": 75332532.0, + "reward": 2.790672779083252, + "reward_std": 0.6563798785209656, + "rewards/reward_fn/mean": 2.790672779083252, + "rewards/reward_fn/std": 0.6563798785209656, + "step": 1631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 79.46875, + "completions/mean_terminated_length": 79.46875, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.17311976238463986, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1328125, + "kl": 0.01943877385929227, + "learning_rate": 7.3476e-06, + "loss": 0.0008, + "num_tokens": 75374595.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 257.625, + "completions/mean_terminated_length": 257.625, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.17322584067041477, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.022583408746868372, + "learning_rate": 7.3472e-06, + "loss": 0.0228, + "num_tokens": 75411671.0, + "reward": 2.8476908206939697, + "reward_std": 0.033805277198553085, + "rewards/reward_fn/mean": 2.8476908206939697, + "rewards/reward_fn/std": 0.033805254846811295, + "step": 1633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 816.0, + "completions/max_terminated_length": 816.0, + "completions/mean_length": 269.0625, + "completions/mean_terminated_length": 269.0625, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.17333191895618966, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.018412835197523236, + "learning_rate": 7.3468e-06, + "loss": 0.0389, + "num_tokens": 75462937.0, + "reward": 2.860079288482666, + "reward_std": 0.06922987848520279, + "rewards/reward_fn/mean": 2.860079288482666, + "rewards/reward_fn/std": 0.06922990828752518, + "step": 1634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 121.65625, + "completions/mean_terminated_length": 121.65625, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.17343799724196457, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1552734375, + "kl": 0.03100604098290205, + "learning_rate": 7.3464e-06, + "loss": 0.0012, + "num_tokens": 75501678.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 550.0, + "completions/max_terminated_length": 550.0, + "completions/mean_length": 123.15625, + "completions/mean_terminated_length": 123.15625, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.17354407552773948, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1376953125, + "kl": 0.02859083702787757, + "learning_rate": 7.346e-06, + "loss": 0.0011, + "num_tokens": 75537587.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 249.875, + "completions/mean_terminated_length": 249.875, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.17365015381351437, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.02171206660568714, + "learning_rate": 7.3456e-06, + "loss": -0.0201, + "num_tokens": 75581391.0, + "reward": 3.929537057876587, + "reward_std": 0.3985986113548279, + "rewards/reward_fn/mean": 3.929537057876587, + "rewards/reward_fn/std": 0.39859864115715027, + "step": 1637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 930.0, + "completions/max_terminated_length": 930.0, + "completions/mean_length": 135.90625, + "completions/mean_terminated_length": 135.90625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.17375623209928928, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12890625, + "kl": 0.027793261338956654, + "learning_rate": 7.3451999999999996e-06, + "loss": 0.0011, + "num_tokens": 75616940.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 273.09375, + "completions/mean_terminated_length": 273.09375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.17386231038506417, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.022352764382958412, + "learning_rate": 7.3447999999999995e-06, + "loss": 0.0293, + "num_tokens": 75670383.0, + "reward": 3.888477325439453, + "reward_std": 0.352896511554718, + "rewards/reward_fn/mean": 3.888477325439453, + "rewards/reward_fn/std": 0.352896511554718, + "step": 1639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 938.0, + "completions/max_terminated_length": 938.0, + "completions/mean_length": 292.21875, + "completions/mean_terminated_length": 292.21875, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.17396838867083908, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.014141359482891858, + "learning_rate": 7.3443999999999995e-06, + "loss": 0.0457, + "num_tokens": 75719382.0, + "reward": 3.928811550140381, + "reward_std": 0.4027021825313568, + "rewards/reward_fn/mean": 3.928811550140381, + "rewards/reward_fn/std": 0.4027021825313568, + "step": 1640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 367.40625, + "completions/mean_terminated_length": 313.19354248046875, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.174074466956614, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.0231416008900851, + "learning_rate": 7.344e-06, + "loss": 0.1287, + "num_tokens": 75788419.0, + "reward": 3.5149707794189453, + "reward_std": 1.023628830909729, + "rewards/reward_fn/mean": 3.5149707794189453, + "rewards/reward_fn/std": 1.023628830909729, + "step": 1641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1553.0, + "completions/max_terminated_length": 1553.0, + "completions/mean_length": 384.71875, + "completions/mean_terminated_length": 384.71875, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.17418054524238888, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.02502290392294526, + "learning_rate": 7.3436e-06, + "loss": 0.0406, + "num_tokens": 75839354.0, + "reward": 2.790564775466919, + "reward_std": 0.03924685716629028, + "rewards/reward_fn/mean": 2.790564775466919, + "rewards/reward_fn/std": 0.0392468199133873, + "step": 1642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 726.0, + "completions/max_terminated_length": 726.0, + "completions/mean_length": 202.78125, + "completions/mean_terminated_length": 202.78125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.1742866235281638, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.020574510097503662, + "learning_rate": 7.3432e-06, + "loss": 0.0457, + "num_tokens": 75881331.0, + "reward": 1.727787971496582, + "reward_std": 0.03321034833788872, + "rewards/reward_fn/mean": 1.727787971496582, + "rewards/reward_fn/std": 0.033210329711437225, + "step": 1643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1837.0, + "completions/max_terminated_length": 1837.0, + "completions/mean_length": 405.25, + "completions/mean_terminated_length": 405.25, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.17439270181393868, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.022814936703070998, + "learning_rate": 7.3428e-06, + "loss": 0.029, + "num_tokens": 75926363.0, + "reward": 3.639138698577881, + "reward_std": 0.545224666595459, + "rewards/reward_fn/mean": 3.639138698577881, + "rewards/reward_fn/std": 0.545224666595459, + "step": 1644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 204.9375, + "completions/mean_terminated_length": 204.9375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.1744987800997136, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08837890625, + "kl": 0.01477151014842093, + "learning_rate": 7.342399999999999e-06, + "loss": 0.0006, + "num_tokens": 75985561.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 140.0625, + "completions/mean_terminated_length": 140.0625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.1746048583854885, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11474609375, + "kl": 0.02253897808259353, + "learning_rate": 7.341999999999999e-06, + "loss": 0.0009, + "num_tokens": 76027675.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 839.0, + "completions/max_terminated_length": 839.0, + "completions/mean_length": 214.6875, + "completions/mean_terminated_length": 214.6875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.1747109366712634, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09130859375, + "kl": 0.021346024004742503, + "learning_rate": 7.341599999999999e-06, + "loss": 0.0009, + "num_tokens": 76071761.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 214.15625, + "completions/mean_terminated_length": 214.15625, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.1748170149570383, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.076171875, + "kl": 0.013097033952362835, + "learning_rate": 7.341199999999999e-06, + "loss": 0.0005, + "num_tokens": 76115766.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1412.0, + "completions/max_terminated_length": 1412.0, + "completions/mean_length": 461.875, + "completions/mean_terminated_length": 461.875, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.1749230932428132, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08740234375, + "kl": 0.022934141103178263, + "learning_rate": 7.340799999999999e-06, + "loss": 0.0009, + "num_tokens": 76163602.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 137.71875, + "completions/mean_terminated_length": 137.71875, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.1750291715285881, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.90625, + "kl": 0.026293474482372403, + "learning_rate": 7.340399999999999e-06, + "loss": -0.0782, + "num_tokens": 76195721.0, + "reward": 2.860340118408203, + "reward_std": 0.05363810062408447, + "rewards/reward_fn/mean": 2.860340118408203, + "rewards/reward_fn/std": 0.05363810807466507, + "step": 1650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 987.0, + "completions/max_terminated_length": 987.0, + "completions/mean_length": 301.78125, + "completions/mean_terminated_length": 301.78125, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.175135249814363, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.025250343373045325, + "learning_rate": 7.339999999999999e-06, + "loss": -0.0705, + "num_tokens": 76238914.0, + "reward": 3.3528342247009277, + "reward_std": 0.5812612771987915, + "rewards/reward_fn/mean": 3.3528342247009277, + "rewards/reward_fn/std": 0.5812612175941467, + "step": 1651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.0, + "completions/max_terminated_length": 690.0, + "completions/mean_length": 188.625, + "completions/mean_terminated_length": 188.625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.1752413281001379, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09228515625, + "kl": 0.02199221565388143, + "learning_rate": 7.339599999999999e-06, + "loss": 0.0009, + "num_tokens": 76281878.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 701.0, + "completions/max_terminated_length": 701.0, + "completions/mean_length": 261.1875, + "completions/mean_terminated_length": 261.1875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.1753474063859128, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.890625, + "kl": 0.025439413264393806, + "learning_rate": 7.3392e-06, + "loss": 0.0509, + "num_tokens": 76320860.0, + "reward": 2.9493305683135986, + "reward_std": 0.046215642243623734, + "rewards/reward_fn/mean": 2.9493305683135986, + "rewards/reward_fn/std": 0.04621569812297821, + "step": 1653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1948.0, + "completions/max_terminated_length": 1948.0, + "completions/mean_length": 450.1875, + "completions/mean_terminated_length": 450.1875, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.1754534846716877, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.021173911402001977, + "learning_rate": 7.3388e-06, + "loss": 0.0012, + "num_tokens": 76376802.0, + "reward": 3.892618417739868, + "reward_std": 0.3393568694591522, + "rewards/reward_fn/mean": 3.892618417739868, + "rewards/reward_fn/std": 0.3393568992614746, + "step": 1654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 173.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 113.84375, + "completions/mean_terminated_length": 113.84375, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.1755595629574626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1845703125, + "kl": 0.030263771768659353, + "learning_rate": 7.3384e-06, + "loss": 0.0012, + "num_tokens": 76405821.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 270.375, + "completions/mean_terminated_length": 270.375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.1756656412432375, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.02956866892054677, + "learning_rate": 7.338e-06, + "loss": -0.0694, + "num_tokens": 76456809.0, + "reward": 2.06693172454834, + "reward_std": 0.4959835708141327, + "rewards/reward_fn/mean": 2.06693172454834, + "rewards/reward_fn/std": 0.4959836006164551, + "step": 1656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1366.0, + "completions/max_terminated_length": 1366.0, + "completions/mean_length": 388.28125, + "completions/mean_terminated_length": 388.28125, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.1757717195290124, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.02950794971548021, + "learning_rate": 7.3376e-06, + "loss": -0.0245, + "num_tokens": 76494290.0, + "reward": 3.5781798362731934, + "reward_std": 0.5563759207725525, + "rewards/reward_fn/mean": 3.5781798362731934, + "rewards/reward_fn/std": 0.5563759207725525, + "step": 1657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 188.28125, + "completions/mean_terminated_length": 188.28125, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.17587779781478732, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09228515625, + "kl": 0.01824926841072738, + "learning_rate": 7.3372e-06, + "loss": 0.0007, + "num_tokens": 76539323.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 235.96875, + "completions/mean_terminated_length": 235.96875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.1759838761005622, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.025267949560657144, + "learning_rate": 7.3368e-06, + "loss": 0.0003, + "num_tokens": 76581434.0, + "reward": 3.967522382736206, + "reward_std": 0.18372122943401337, + "rewards/reward_fn/mean": 3.967522382736206, + "rewards/reward_fn/std": 0.18372122943401337, + "step": 1659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 813.0, + "completions/max_terminated_length": 813.0, + "completions/mean_length": 199.46875, + "completions/mean_terminated_length": 199.46875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.17608995438633712, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.02116774907335639, + "learning_rate": 7.3364e-06, + "loss": 0.1391, + "num_tokens": 76600041.0, + "reward": 2.821526288986206, + "reward_std": 0.06851638108491898, + "rewards/reward_fn/mean": 2.821526288986206, + "rewards/reward_fn/std": 0.06851641088724136, + "step": 1660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 342.59375, + "completions/mean_terminated_length": 342.59375, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.176196032672112, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.02219717751722783, + "learning_rate": 7.336e-06, + "loss": -0.0077, + "num_tokens": 76649180.0, + "reward": 3.674363613128662, + "reward_std": 0.4910111427307129, + "rewards/reward_fn/mean": 3.674363613128662, + "rewards/reward_fn/std": 0.4910111427307129, + "step": 1661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1291.0, + "completions/max_terminated_length": 1291.0, + "completions/mean_length": 370.125, + "completions/mean_terminated_length": 370.125, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.17630211095788692, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.023815440014004707, + "learning_rate": 7.3356e-06, + "loss": 0.004, + "num_tokens": 76700128.0, + "reward": 2.9858880043029785, + "reward_std": 0.08336754143238068, + "rewards/reward_fn/mean": 2.9858880043029785, + "rewards/reward_fn/std": 0.08336751163005829, + "step": 1662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1206.0, + "completions/max_terminated_length": 1206.0, + "completions/mean_length": 459.6875, + "completions/mean_terminated_length": 459.6875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.17640818924366183, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.01935293711721897, + "learning_rate": 7.3352e-06, + "loss": -0.2065, + "num_tokens": 76758582.0, + "reward": 3.0257115364074707, + "reward_std": 0.9167580604553223, + "rewards/reward_fn/mean": 3.0257115364074707, + "rewards/reward_fn/std": 0.9167580008506775, + "step": 1663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 808.0, + "completions/max_terminated_length": 808.0, + "completions/mean_length": 320.25, + "completions/mean_terminated_length": 320.25, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.17651426752943672, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07177734375, + "kl": 0.02033980656415224, + "learning_rate": 7.3348000000000005e-06, + "loss": 0.0008, + "num_tokens": 76798270.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 811.0, + "completions/max_terminated_length": 811.0, + "completions/mean_length": 496.09375, + "completions/mean_terminated_length": 496.09375, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.17662034581521163, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.020821964601054788, + "learning_rate": 7.3344e-06, + "loss": 0.0254, + "num_tokens": 76859617.0, + "reward": 3.4188601970672607, + "reward_std": 0.522709846496582, + "rewards/reward_fn/mean": 3.4188601970672607, + "rewards/reward_fn/std": 0.5227097868919373, + "step": 1665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 212.25, + "completions/mean_terminated_length": 212.25, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.17672642410098652, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.027814438799396157, + "learning_rate": 7.334e-06, + "loss": 0.0636, + "num_tokens": 76905065.0, + "reward": 2.8302299976348877, + "reward_std": 0.05470141023397446, + "rewards/reward_fn/mean": 2.8302299976348877, + "rewards/reward_fn/std": 0.054701436311006546, + "step": 1666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1034.0, + "completions/max_terminated_length": 1034.0, + "completions/mean_length": 355.4375, + "completions/mean_terminated_length": 355.4375, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.17683250238676143, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.023907755268737674, + "learning_rate": 7.3335999999999995e-06, + "loss": 0.0154, + "num_tokens": 76972471.0, + "reward": 3.9634013175964355, + "reward_std": 0.20703355967998505, + "rewards/reward_fn/mean": 3.9634013175964355, + "rewards/reward_fn/std": 0.20703357458114624, + "step": 1667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 684.0, + "completions/max_terminated_length": 684.0, + "completions/mean_length": 216.09375, + "completions/mean_terminated_length": 216.09375, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.17693858067253634, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.03942023706622422, + "learning_rate": 7.3331999999999995e-06, + "loss": 0.0491, + "num_tokens": 77002138.0, + "reward": 3.0410289764404297, + "reward_std": 0.03184741735458374, + "rewards/reward_fn/mean": 3.0410289764404297, + "rewards/reward_fn/std": 0.03184738755226135, + "step": 1668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1047.0, + "completions/max_terminated_length": 1047.0, + "completions/mean_length": 390.9375, + "completions/mean_terminated_length": 390.9375, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.17704465895831123, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.023845213698223233, + "learning_rate": 7.3327999999999995e-06, + "loss": 0.0154, + "num_tokens": 77068280.0, + "reward": 3.8936331272125244, + "reward_std": 0.4476320743560791, + "rewards/reward_fn/mean": 3.8936331272125244, + "rewards/reward_fn/std": 0.4476320147514343, + "step": 1669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 432.8125, + "completions/mean_terminated_length": 380.70965576171875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.17715073724408614, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.67578125, + "kl": 0.019716886803507805, + "learning_rate": 7.3323999999999995e-06, + "loss": 0.2615, + "num_tokens": 77121362.0, + "reward": 2.792949676513672, + "reward_std": 0.5104948282241821, + "rewards/reward_fn/mean": 2.792949676513672, + "rewards/reward_fn/std": 0.5104948878288269, + "step": 1670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 154.0625, + "completions/mean_terminated_length": 154.0625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.17725681552986103, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1474609375, + "kl": 0.022587708896026015, + "learning_rate": 7.3319999999999994e-06, + "loss": 0.0009, + "num_tokens": 77158388.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 195.28125, + "completions/mean_terminated_length": 195.28125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.17736289381563594, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.023013029946014285, + "learning_rate": 7.331599999999999e-06, + "loss": -0.0029, + "num_tokens": 77195357.0, + "reward": 2.860653877258301, + "reward_std": 0.23699697852134705, + "rewards/reward_fn/mean": 2.860653877258301, + "rewards/reward_fn/std": 0.23699693381786346, + "step": 1672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 163.375, + "completions/mean_terminated_length": 163.375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.17746897210141085, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10400390625, + "kl": 0.021371350390836596, + "learning_rate": 7.331199999999999e-06, + "loss": 0.0009, + "num_tokens": 77235081.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.0, + "completions/max_terminated_length": 545.0, + "completions/mean_length": 304.4375, + "completions/mean_terminated_length": 304.4375, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.17757505038718574, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.023021605564281344, + "learning_rate": 7.330799999999999e-06, + "loss": 0.043, + "num_tokens": 77280023.0, + "reward": 2.8056857585906982, + "reward_std": 0.30699968338012695, + "rewards/reward_fn/mean": 2.8056857585906982, + "rewards/reward_fn/std": 0.30699968338012695, + "step": 1674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1218.0, + "completions/mean_length": 732.71875, + "completions/mean_terminated_length": 690.290283203125, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.17768112867296065, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.019517116714268923, + "learning_rate": 7.330399999999999e-06, + "loss": 0.1688, + "num_tokens": 77344622.0, + "reward": 2.619640588760376, + "reward_std": 0.5425283312797546, + "rewards/reward_fn/mean": 2.619640588760376, + "rewards/reward_fn/std": 0.5425283312797546, + "step": 1675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 672.0, + "completions/max_terminated_length": 672.0, + "completions/mean_length": 181.34375, + "completions/mean_terminated_length": 181.34375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.17778720695873554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11376953125, + "kl": 0.024739502929151058, + "learning_rate": 7.33e-06, + "loss": 0.001, + "num_tokens": 77377721.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 100.5, + "completions/mean_terminated_length": 100.5, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.17789328524451045, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.142578125, + "kl": 0.026414725929498672, + "learning_rate": 7.3296e-06, + "loss": 0.0011, + "num_tokens": 77404681.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1167.0, + "completions/max_terminated_length": 1167.0, + "completions/mean_length": 275.75, + "completions/mean_terminated_length": 275.75, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.17799936353028534, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11083984375, + "kl": 0.02964741620235145, + "learning_rate": 7.3292e-06, + "loss": 0.0012, + "num_tokens": 77443233.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 978.0, + "completions/max_terminated_length": 978.0, + "completions/mean_length": 253.375, + "completions/mean_terminated_length": 253.375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.17810544181606025, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.375, + "kl": 0.02185642090626061, + "learning_rate": 7.3288e-06, + "loss": 0.0958, + "num_tokens": 77487885.0, + "reward": 3.9379520416259766, + "reward_std": 0.2442624717950821, + "rewards/reward_fn/mean": 3.9379520416259766, + "rewards/reward_fn/std": 0.2442624568939209, + "step": 1679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1404.0, + "completions/max_terminated_length": 1404.0, + "completions/mean_length": 439.84375, + "completions/mean_terminated_length": 439.84375, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.17821152010183516, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.022376101464033127, + "learning_rate": 7.3284e-06, + "loss": -0.1057, + "num_tokens": 77546440.0, + "reward": 2.7930126190185547, + "reward_std": 0.36263859272003174, + "rewards/reward_fn/mean": 2.7930126190185547, + "rewards/reward_fn/std": 0.36263859272003174, + "step": 1680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1977.0, + "completions/max_terminated_length": 1977.0, + "completions/mean_length": 513.40625, + "completions/mean_terminated_length": 513.40625, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.17831759838761005, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.03152905683964491, + "learning_rate": 7.328e-06, + "loss": -0.087, + "num_tokens": 77591349.0, + "reward": 2.629180431365967, + "reward_std": 0.29072603583335876, + "rewards/reward_fn/mean": 2.629180431365967, + "rewards/reward_fn/std": 0.29072603583335876, + "step": 1681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1536.0, + "completions/mean_length": 461.46875, + "completions/mean_terminated_length": 461.46875, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.17842367667338496, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.02138684457167983, + "learning_rate": 7.3276e-06, + "loss": 0.1139, + "num_tokens": 77646148.0, + "reward": 3.8168039321899414, + "reward_std": 0.5191534161567688, + "rewards/reward_fn/mean": 3.8168039321899414, + "rewards/reward_fn/std": 0.5191534161567688, + "step": 1682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 799.0, + "completions/max_terminated_length": 799.0, + "completions/mean_length": 226.09375, + "completions/mean_terminated_length": 226.09375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.17852975495915985, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.390625, + "kl": 0.025132108945399523, + "learning_rate": 7.3272e-06, + "loss": 0.0778, + "num_tokens": 77708775.0, + "reward": 2.9388680458068848, + "reward_std": 0.24276258051395416, + "rewards/reward_fn/mean": 2.9388680458068848, + "rewards/reward_fn/std": 0.2427625209093094, + "step": 1683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 986.0, + "completions/max_terminated_length": 986.0, + "completions/mean_length": 357.28125, + "completions/mean_terminated_length": 357.28125, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.17863583324493476, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07275390625, + "kl": 0.018456691992469132, + "learning_rate": 7.3268e-06, + "loss": 0.0007, + "num_tokens": 77758864.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 90.0, + "completions/max_terminated_length": 90.0, + "completions/mean_length": 75.53125, + "completions/mean_terminated_length": 75.53125, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.17874191153070967, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1083984375, + "kl": 0.010399193910416216, + "learning_rate": 7.326399999999999e-06, + "loss": 0.0004, + "num_tokens": 77786081.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 170.0, + "completions/max_terminated_length": 170.0, + "completions/mean_length": 126.03125, + "completions/mean_terminated_length": 126.03125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.17884798981648456, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.115234375, + "kl": 0.0217991154640913, + "learning_rate": 7.325999999999999e-06, + "loss": 0.0009, + "num_tokens": 77822306.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1886.0, + "completions/max_terminated_length": 1886.0, + "completions/mean_length": 581.40625, + "completions/mean_terminated_length": 581.40625, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.17895406810225947, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.025571937672793865, + "learning_rate": 7.325599999999999e-06, + "loss": -0.0045, + "num_tokens": 77876303.0, + "reward": 2.0563902854919434, + "reward_std": 0.5362445712089539, + "rewards/reward_fn/mean": 2.0563902854919434, + "rewards/reward_fn/std": 0.5362445712089539, + "step": 1687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 736.0, + "completions/max_terminated_length": 736.0, + "completions/mean_length": 267.875, + "completions/mean_terminated_length": 267.875, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.17906014638803436, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9765625, + "kl": 0.027823872631415725, + "learning_rate": 7.3252e-06, + "loss": 0.0438, + "num_tokens": 77919531.0, + "reward": 3.966395854949951, + "reward_std": 0.19009362161159515, + "rewards/reward_fn/mean": 3.966395854949951, + "rewards/reward_fn/std": 0.19009362161159515, + "step": 1688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 905.0, + "completions/max_terminated_length": 905.0, + "completions/mean_length": 307.46875, + "completions/mean_terminated_length": 307.46875, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.17916622467380927, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.02866238821297884, + "learning_rate": 7.3248e-06, + "loss": -0.063, + "num_tokens": 77975002.0, + "reward": 3.857930898666382, + "reward_std": 0.559036374092102, + "rewards/reward_fn/mean": 3.857930898666382, + "rewards/reward_fn/std": 0.5590363144874573, + "step": 1689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 841.0, + "completions/max_terminated_length": 841.0, + "completions/mean_length": 193.375, + "completions/mean_terminated_length": 193.375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.17927230295958418, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10205078125, + "kl": 0.02129594807047397, + "learning_rate": 7.3244e-06, + "loss": 0.0009, + "num_tokens": 78026790.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 240.96875, + "completions/mean_terminated_length": 240.96875, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.17937838124535907, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.020590995671227574, + "learning_rate": 7.324e-06, + "loss": 0.1261, + "num_tokens": 78058405.0, + "reward": 2.8747594356536865, + "reward_std": 0.06020258367061615, + "rewards/reward_fn/mean": 2.8747594356536865, + "rewards/reward_fn/std": 0.06020255759358406, + "step": 1691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 295.15625, + "completions/mean_terminated_length": 295.15625, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.17948445953113398, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8828125, + "kl": 0.025204677367582917, + "learning_rate": 7.3236e-06, + "loss": 0.0591, + "num_tokens": 78102698.0, + "reward": 3.822904348373413, + "reward_std": 0.41878360509872437, + "rewards/reward_fn/mean": 3.822904348373413, + "rewards/reward_fn/std": 0.418783575296402, + "step": 1692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 159.40625, + "completions/mean_terminated_length": 159.40625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.17959053781690887, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.625, + "kl": 0.03215904091484845, + "learning_rate": 7.3232e-06, + "loss": 0.1193, + "num_tokens": 78143831.0, + "reward": 2.951641321182251, + "reward_std": 0.02703152783215046, + "rewards/reward_fn/mean": 2.951641321182251, + "rewards/reward_fn/std": 0.02703148126602173, + "step": 1693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 535.96875, + "completions/mean_terminated_length": 435.16668701171875, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.17969661610268378, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.025029084412381053, + "learning_rate": 7.3228e-06, + "loss": 0.3638, + "num_tokens": 78197942.0, + "reward": 2.569448471069336, + "reward_std": 0.7918709516525269, + "rewards/reward_fn/mean": 2.569448471069336, + "rewards/reward_fn/std": 0.7918709516525269, + "step": 1694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1559.0, + "completions/max_terminated_length": 1559.0, + "completions/mean_length": 249.1875, + "completions/mean_terminated_length": 249.1875, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.1798026943884587, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.028609147295355797, + "learning_rate": 7.3223999999999996e-06, + "loss": 0.2358, + "num_tokens": 78249308.0, + "reward": 3.925804615020752, + "reward_std": 0.2920267581939697, + "rewards/reward_fn/mean": 3.925804615020752, + "rewards/reward_fn/std": 0.2920267581939697, + "step": 1695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1599.0, + "completions/max_terminated_length": 1599.0, + "completions/mean_length": 509.09375, + "completions/mean_terminated_length": 509.09375, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.17990877267423358, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.01797068677842617, + "learning_rate": 7.3219999999999995e-06, + "loss": 0.0474, + "num_tokens": 78312799.0, + "reward": 2.6847097873687744, + "reward_std": 0.18323646485805511, + "rewards/reward_fn/mean": 2.6847097873687744, + "rewards/reward_fn/std": 0.18323642015457153, + "step": 1696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 176.28125, + "completions/mean_terminated_length": 176.28125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.1800148509600085, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.026832085801288486, + "learning_rate": 7.3215999999999995e-06, + "loss": 0.1148, + "num_tokens": 78338376.0, + "reward": 2.6626052856445312, + "reward_std": 0.06326717883348465, + "rewards/reward_fn/mean": 2.6626052856445312, + "rewards/reward_fn/std": 0.06326717138290405, + "step": 1697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 948.0, + "completions/max_terminated_length": 948.0, + "completions/mean_length": 326.28125, + "completions/mean_terminated_length": 326.28125, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.18012092924578338, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.023031077347695827, + "learning_rate": 7.3211999999999995e-06, + "loss": 0.0221, + "num_tokens": 78392721.0, + "reward": 3.9276814460754395, + "reward_std": 0.40909603238105774, + "rewards/reward_fn/mean": 3.9276814460754395, + "rewards/reward_fn/std": 0.40909600257873535, + "step": 1698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 219.40625, + "completions/mean_terminated_length": 219.40625, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.1802270075315583, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.375, + "kl": 0.02422755560837686, + "learning_rate": 7.3208e-06, + "loss": 0.0462, + "num_tokens": 78431102.0, + "reward": 3.896008014678955, + "reward_std": 0.3290996253490448, + "rewards/reward_fn/mean": 3.896008014678955, + "rewards/reward_fn/std": 0.3290995657444, + "step": 1699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 383.46875, + "completions/mean_terminated_length": 272.5, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.1803330858173332, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.98046875, + "kl": 0.018242677440866828, + "learning_rate": 7.3204e-06, + "loss": 0.3524, + "num_tokens": 78500333.0, + "reward": 3.309962511062622, + "reward_std": 1.0754379034042358, + "rewards/reward_fn/mean": 3.309962511062622, + "rewards/reward_fn/std": 1.0754379034042358, + "step": 1700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 908.0, + "completions/max_terminated_length": 908.0, + "completions/mean_length": 347.96875, + "completions/mean_terminated_length": 347.96875, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.1804391641031081, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.016349896206520498, + "learning_rate": 7.32e-06, + "loss": 0.1304, + "num_tokens": 78567532.0, + "reward": 3.853870153427124, + "reward_std": 0.5750675797462463, + "rewards/reward_fn/mean": 3.853870153427124, + "rewards/reward_fn/std": 0.5750675797462463, + "step": 1701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 168.375, + "completions/mean_terminated_length": 168.375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.180545242388883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09814453125, + "kl": 0.025941829895600677, + "learning_rate": 7.3196e-06, + "loss": 0.001, + "num_tokens": 78615864.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 870.0, + "completions/max_terminated_length": 870.0, + "completions/mean_length": 275.125, + "completions/mean_terminated_length": 275.125, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.1806513206746579, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07421875, + "kl": 0.02063106745481491, + "learning_rate": 7.3192e-06, + "loss": 0.0008, + "num_tokens": 78639100.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 634.0, + "completions/max_terminated_length": 634.0, + "completions/mean_length": 210.1875, + "completions/mean_terminated_length": 210.1875, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.1807573989604328, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.024305471451953053, + "learning_rate": 7.3188e-06, + "loss": 0.0736, + "num_tokens": 78684962.0, + "reward": 3.971139907836914, + "reward_std": 0.16325806081295013, + "rewards/reward_fn/mean": 3.971139907836914, + "rewards/reward_fn/std": 0.16325809061527252, + "step": 1704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1198.0, + "completions/max_terminated_length": 1198.0, + "completions/mean_length": 321.625, + "completions/mean_terminated_length": 321.625, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.1808634772462077, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.021618828177452087, + "learning_rate": 7.318399999999999e-06, + "loss": 0.0133, + "num_tokens": 78735606.0, + "reward": 3.964750289916992, + "reward_std": 0.1994020640850067, + "rewards/reward_fn/mean": 3.964750289916992, + "rewards/reward_fn/std": 0.19940204918384552, + "step": 1705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 194.53125, + "completions/mean_terminated_length": 194.53125, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.1809695555319826, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0908203125, + "kl": 0.017645681044086814, + "learning_rate": 7.317999999999999e-06, + "loss": 0.0007, + "num_tokens": 78767687.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 162.25, + "completions/mean_terminated_length": 162.25, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.18107563381775751, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1162109375, + "kl": 0.021362949744798243, + "learning_rate": 7.317599999999999e-06, + "loss": 0.0009, + "num_tokens": 78815471.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 675.0, + "completions/max_terminated_length": 675.0, + "completions/mean_length": 240.90625, + "completions/mean_terminated_length": 240.90625, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.1811817121035324, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.021373329684138298, + "learning_rate": 7.317199999999999e-06, + "loss": -0.0466, + "num_tokens": 78869164.0, + "reward": 1.739349365234375, + "reward_std": 0.4158928692340851, + "rewards/reward_fn/mean": 1.739349365234375, + "rewards/reward_fn/std": 0.4158928096294403, + "step": 1708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1127.0, + "completions/max_terminated_length": 1127.0, + "completions/mean_length": 254.40625, + "completions/mean_terminated_length": 254.40625, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.1812877903893073, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.022718794643878937, + "learning_rate": 7.316799999999999e-06, + "loss": -0.0496, + "num_tokens": 78913561.0, + "reward": 3.9045119285583496, + "reward_std": 0.30215689539909363, + "rewards/reward_fn/mean": 3.9045119285583496, + "rewards/reward_fn/std": 0.30215683579444885, + "step": 1709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1688.0, + "completions/mean_length": 786.8125, + "completions/mean_terminated_length": 746.1290283203125, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.1813938686750822, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.020097526721656322, + "learning_rate": 7.316399999999999e-06, + "loss": 0.1906, + "num_tokens": 78967091.0, + "reward": 2.3898777961730957, + "reward_std": 0.6157297492027283, + "rewards/reward_fn/mean": 2.3898777961730957, + "rewards/reward_fn/std": 0.6157297492027283, + "step": 1710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1443.0, + "completions/max_terminated_length": 1443.0, + "completions/mean_length": 439.78125, + "completions/mean_terminated_length": 439.78125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.1814999469608571, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.025020675268024206, + "learning_rate": 7.316e-06, + "loss": 0.007, + "num_tokens": 78999244.0, + "reward": 3.8277320861816406, + "reward_std": 0.571384847164154, + "rewards/reward_fn/mean": 3.8277320861816406, + "rewards/reward_fn/std": 0.571384847164154, + "step": 1711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 107.59375, + "completions/mean_terminated_length": 107.59375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.18160602524663202, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10595703125, + "kl": 0.014298198861069977, + "learning_rate": 7.3156e-06, + "loss": 0.0006, + "num_tokens": 79037471.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1139.0, + "completions/max_terminated_length": 1139.0, + "completions/mean_length": 489.375, + "completions/mean_terminated_length": 489.375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.1817121035324069, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.019865088164806366, + "learning_rate": 7.3152e-06, + "loss": -0.0399, + "num_tokens": 79087947.0, + "reward": 3.5038528442382812, + "reward_std": 0.6733208298683167, + "rewards/reward_fn/mean": 3.5038528442382812, + "rewards/reward_fn/std": 0.6733208298683167, + "step": 1713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1621.0, + "completions/mean_length": 738.0625, + "completions/mean_terminated_length": 650.7333374023438, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.18181818181818182, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.0224718798417598, + "learning_rate": 7.3148e-06, + "loss": 0.1838, + "num_tokens": 79141901.0, + "reward": 2.2625229358673096, + "reward_std": 0.8606188297271729, + "rewards/reward_fn/mean": 2.2625229358673096, + "rewards/reward_fn/std": 0.8606187105178833, + "step": 1714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 270.8125, + "completions/mean_terminated_length": 270.8125, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.1819242601039567, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1005859375, + "kl": 0.024445211980491877, + "learning_rate": 7.3144e-06, + "loss": 0.001, + "num_tokens": 79185863.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1562.0, + "completions/max_terminated_length": 1562.0, + "completions/mean_length": 340.28125, + "completions/mean_terminated_length": 340.28125, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.18203033838973162, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.02192999073304236, + "learning_rate": 7.314e-06, + "loss": -0.0481, + "num_tokens": 79243696.0, + "reward": 3.75516676902771, + "reward_std": 0.470197856426239, + "rewards/reward_fn/mean": 3.75516676902771, + "rewards/reward_fn/std": 0.470197856426239, + "step": 1716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 797.0, + "completions/max_terminated_length": 797.0, + "completions/mean_length": 412.375, + "completions/mean_terminated_length": 412.375, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.18213641667550654, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.017105724662542343, + "learning_rate": 7.3136e-06, + "loss": 0.0388, + "num_tokens": 79273980.0, + "reward": 3.72775936126709, + "reward_std": 0.4455103576183319, + "rewards/reward_fn/mean": 3.72775936126709, + "rewards/reward_fn/std": 0.4455103576183319, + "step": 1717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 870.0, + "completions/max_terminated_length": 870.0, + "completions/mean_length": 374.875, + "completions/mean_terminated_length": 374.875, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.18224249496128142, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.01594231475610286, + "learning_rate": 7.3132e-06, + "loss": 0.0312, + "num_tokens": 79324824.0, + "reward": 2.8251233100891113, + "reward_std": 0.0379062183201313, + "rewards/reward_fn/mean": 2.8251233100891113, + "rewards/reward_fn/std": 0.03790617734193802, + "step": 1718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 811.0, + "completions/max_terminated_length": 811.0, + "completions/mean_length": 228.09375, + "completions/mean_terminated_length": 228.09375, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.18234857324705633, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.02496536774560809, + "learning_rate": 7.3128e-06, + "loss": 0.0136, + "num_tokens": 79382683.0, + "reward": 3.88787579536438, + "reward_std": 0.4645228385925293, + "rewards/reward_fn/mean": 3.88787579536438, + "rewards/reward_fn/std": 0.4645228087902069, + "step": 1719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1386.0, + "completions/max_terminated_length": 1386.0, + "completions/mean_length": 340.71875, + "completions/mean_terminated_length": 340.71875, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.18245465153283122, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.02118815656285733, + "learning_rate": 7.3124e-06, + "loss": 0.0761, + "num_tokens": 79439858.0, + "reward": 2.998422622680664, + "reward_std": 0.06069687008857727, + "rewards/reward_fn/mean": 2.998422622680664, + "rewards/reward_fn/std": 0.060696884989738464, + "step": 1720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1661.0, + "completions/max_terminated_length": 1661.0, + "completions/mean_length": 414.15625, + "completions/mean_terminated_length": 414.15625, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.18256072981860613, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.029617647174745798, + "learning_rate": 7.312e-06, + "loss": 0.0364, + "num_tokens": 79483319.0, + "reward": 2.8089561462402344, + "reward_std": 0.05686548724770546, + "rewards/reward_fn/mean": 2.8089561462402344, + "rewards/reward_fn/std": 0.05686549097299576, + "step": 1721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1007.0, + "completions/max_terminated_length": 1007.0, + "completions/mean_length": 184.8125, + "completions/mean_terminated_length": 184.8125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.18266680810438105, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5, + "kl": 0.020757826045155525, + "learning_rate": 7.3116e-06, + "loss": -0.0682, + "num_tokens": 79523601.0, + "reward": 2.8676366806030273, + "reward_std": 0.06009431183338165, + "rewards/reward_fn/mean": 2.8676366806030273, + "rewards/reward_fn/std": 0.060094304382801056, + "step": 1722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 821.0, + "completions/max_terminated_length": 821.0, + "completions/mean_length": 275.34375, + "completions/mean_terminated_length": 275.34375, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.18277288639015593, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.02504241978749633, + "learning_rate": 7.3112e-06, + "loss": -0.0056, + "num_tokens": 79566236.0, + "reward": 2.6496548652648926, + "reward_std": 0.2010585367679596, + "rewards/reward_fn/mean": 2.6496548652648926, + "rewards/reward_fn/std": 0.2010585218667984, + "step": 1723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1770.0, + "completions/max_terminated_length": 1770.0, + "completions/mean_length": 679.5, + "completions/mean_terminated_length": 679.5, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, + "epoch": 0.18287896467593084, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.019714355003088713, + "learning_rate": 7.3108e-06, + "loss": 0.1588, + "num_tokens": 79649772.0, + "reward": 3.8880138397216797, + "reward_std": 0.4623366594314575, + "rewards/reward_fn/mean": 3.8880138397216797, + "rewards/reward_fn/std": 0.4623366594314575, + "step": 1724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 583.0, + "completions/max_terminated_length": 583.0, + "completions/mean_length": 177.46875, + "completions/mean_terminated_length": 177.46875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.18298504296170573, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.019116016919724643, + "learning_rate": 7.3103999999999995e-06, + "loss": -0.0512, + "num_tokens": 79694267.0, + "reward": 3.8662519454956055, + "reward_std": 0.5263176560401917, + "rewards/reward_fn/mean": 3.8662519454956055, + "rewards/reward_fn/std": 0.5263176560401917, + "step": 1725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1104.0, + "completions/max_terminated_length": 1104.0, + "completions/mean_length": 382.09375, + "completions/mean_terminated_length": 382.09375, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.18309112124748064, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.017504334566183388, + "learning_rate": 7.3099999999999995e-06, + "loss": 0.0785, + "num_tokens": 79748798.0, + "reward": 3.590195655822754, + "reward_std": 0.575702428817749, + "rewards/reward_fn/mean": 3.590195655822754, + "rewards/reward_fn/std": 0.5757024884223938, + "step": 1726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 178.0, + "completions/max_terminated_length": 178.0, + "completions/mean_length": 129.59375, + "completions/mean_terminated_length": 129.59375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.18319719953325556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.068359375, + "kl": 0.012138587655499578, + "learning_rate": 7.3095999999999994e-06, + "loss": 0.0005, + "num_tokens": 79783569.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1365.0, + "completions/max_terminated_length": 1365.0, + "completions/mean_length": 294.65625, + "completions/mean_terminated_length": 294.65625, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.18330327781903044, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.021464786026626825, + "learning_rate": 7.309199999999999e-06, + "loss": 0.0009, + "num_tokens": 79829862.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1002.0, + "completions/max_terminated_length": 1002.0, + "completions/mean_length": 289.34375, + "completions/mean_terminated_length": 289.34375, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.18340935610480535, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9921875, + "kl": 0.032017044024541974, + "learning_rate": 7.308799999999999e-06, + "loss": 0.1641, + "num_tokens": 79871569.0, + "reward": 3.642909288406372, + "reward_std": 0.6748332977294922, + "rewards/reward_fn/mean": 3.642909288406372, + "rewards/reward_fn/std": 0.6748332977294922, + "step": 1729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1716.0, + "completions/mean_length": 1106.5625, + "completions/mean_terminated_length": 889.3077392578125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.18351543439058024, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.01772727456409484, + "learning_rate": 7.308399999999999e-06, + "loss": 0.3112, + "num_tokens": 79935683.0, + "reward": 2.0223124027252197, + "reward_std": 1.02803635597229, + "rewards/reward_fn/mean": 2.0223124027252197, + "rewards/reward_fn/std": 1.02803635597229, + "step": 1730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 178.65625, + "completions/mean_terminated_length": 178.65625, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.18362151267635515, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11181640625, + "kl": 0.021037452504970133, + "learning_rate": 7.307999999999999e-06, + "loss": 0.0008, + "num_tokens": 79976280.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 691.0, + "completions/max_terminated_length": 691.0, + "completions/mean_length": 354.9375, + "completions/mean_terminated_length": 354.9375, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.18372759096213004, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.030045943101868033, + "learning_rate": 7.307599999999999e-06, + "loss": 0.0632, + "num_tokens": 80034390.0, + "reward": 3.2148056030273438, + "reward_std": 0.7227078080177307, + "rewards/reward_fn/mean": 3.2148056030273438, + "rewards/reward_fn/std": 0.7227078080177307, + "step": 1732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1008.0, + "completions/max_terminated_length": 1008.0, + "completions/mean_length": 293.21875, + "completions/mean_terminated_length": 293.21875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.18383366924790495, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.01632210414391011, + "learning_rate": 7.307199999999999e-06, + "loss": 0.0775, + "num_tokens": 80096061.0, + "reward": 3.0696334838867188, + "reward_std": 0.03446883708238602, + "rewards/reward_fn/mean": 3.0696334838867188, + "rewards/reward_fn/std": 0.034468866884708405, + "step": 1733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 747.0, + "completions/max_terminated_length": 747.0, + "completions/mean_length": 285.28125, + "completions/mean_terminated_length": 285.28125, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.18393974753367986, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.021304041147232056, + "learning_rate": 7.306799999999999e-06, + "loss": -0.0633, + "num_tokens": 80153734.0, + "reward": 3.1486945152282715, + "reward_std": 0.6498160362243652, + "rewards/reward_fn/mean": 3.1486945152282715, + "rewards/reward_fn/std": 0.6498160362243652, + "step": 1734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1056.0, + "completions/max_terminated_length": 1056.0, + "completions/mean_length": 246.09375, + "completions/mean_terminated_length": 246.09375, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.18404582581945475, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.02485991013236344, + "learning_rate": 7.3064e-06, + "loss": -0.0315, + "num_tokens": 80200777.0, + "reward": 3.749239444732666, + "reward_std": 0.4819631576538086, + "rewards/reward_fn/mean": 3.749239444732666, + "rewards/reward_fn/std": 0.4819631576538086, + "step": 1735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1065.0, + "completions/max_terminated_length": 1065.0, + "completions/mean_length": 242.125, + "completions/mean_terminated_length": 242.125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.18415190410522966, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.023410562425851822, + "learning_rate": 7.306e-06, + "loss": 0.0159, + "num_tokens": 80245261.0, + "reward": 3.9407196044921875, + "reward_std": 0.2346184253692627, + "rewards/reward_fn/mean": 3.9407196044921875, + "rewards/reward_fn/std": 0.2346184253692627, + "step": 1736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1550.0, + "completions/max_terminated_length": 1550.0, + "completions/mean_length": 397.46875, + "completions/mean_terminated_length": 397.46875, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.18425798239100455, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.026979970512911677, + "learning_rate": 7.3056e-06, + "loss": -0.0341, + "num_tokens": 80297532.0, + "reward": 3.124235153198242, + "reward_std": 0.7219064831733704, + "rewards/reward_fn/mean": 3.124235153198242, + "rewards/reward_fn/std": 0.7219064831733704, + "step": 1737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 326.0, + "completions/mean_terminated_length": 326.0, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.18436406067677946, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.03350239293649793, + "learning_rate": 7.3052e-06, + "loss": -0.0451, + "num_tokens": 80336412.0, + "reward": 2.874659299850464, + "reward_std": 0.19675695896148682, + "rewards/reward_fn/mean": 2.874659299850464, + "rewards/reward_fn/std": 0.19675692915916443, + "step": 1738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1408.0, + "completions/max_terminated_length": 1408.0, + "completions/mean_length": 400.5, + "completions/mean_terminated_length": 400.5, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.18447013896255438, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.021226889453828335, + "learning_rate": 7.3048e-06, + "loss": 0.0089, + "num_tokens": 80377772.0, + "reward": 2.802847385406494, + "reward_std": 0.04565891623497009, + "rewards/reward_fn/mean": 2.802847385406494, + "rewards/reward_fn/std": 0.045658860355615616, + "step": 1739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 845.0, + "completions/max_terminated_length": 845.0, + "completions/mean_length": 396.5, + "completions/mean_terminated_length": 396.5, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.18457621724832926, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.02231617399957031, + "learning_rate": 7.3044e-06, + "loss": -0.1365, + "num_tokens": 80411356.0, + "reward": 3.0051422119140625, + "reward_std": 0.4885551333427429, + "rewards/reward_fn/mean": 3.0051422119140625, + "rewards/reward_fn/std": 0.4885551631450653, + "step": 1740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1446.0, + "completions/max_terminated_length": 1446.0, + "completions/mean_length": 328.15625, + "completions/mean_terminated_length": 328.15625, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.18468229553410417, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.02340529253706336, + "learning_rate": 7.304e-06, + "loss": 0.0033, + "num_tokens": 80457441.0, + "reward": 3.927124261856079, + "reward_std": 0.41224780678749084, + "rewards/reward_fn/mean": 3.927124261856079, + "rewards/reward_fn/std": 0.41224780678749084, + "step": 1741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1631.0, + "completions/max_terminated_length": 1631.0, + "completions/mean_length": 397.21875, + "completions/mean_terminated_length": 397.21875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.18478837381987906, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.024748707422986627, + "learning_rate": 7.3036e-06, + "loss": 0.0071, + "num_tokens": 80493704.0, + "reward": 2.719790458679199, + "reward_std": 0.18041294813156128, + "rewards/reward_fn/mean": 2.719790458679199, + "rewards/reward_fn/std": 0.18041293323040009, + "step": 1742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 198.84375, + "completions/mean_terminated_length": 198.84375, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.18489445210565397, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10888671875, + "kl": 0.025478441501036286, + "learning_rate": 7.3032e-06, + "loss": 0.001, + "num_tokens": 80528579.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1338.0, + "completions/max_terminated_length": 1338.0, + "completions/mean_length": 317.625, + "completions/mean_terminated_length": 317.625, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.18500053039142889, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.023378330282866955, + "learning_rate": 7.3028e-06, + "loss": -0.0361, + "num_tokens": 80576375.0, + "reward": 3.9293417930603027, + "reward_std": 0.27804508805274963, + "rewards/reward_fn/mean": 3.9293417930603027, + "rewards/reward_fn/std": 0.278045117855072, + "step": 1744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1164.0, + "completions/max_terminated_length": 1164.0, + "completions/mean_length": 316.71875, + "completions/mean_terminated_length": 316.71875, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.18510660867720377, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.015432185959070921, + "learning_rate": 7.302399999999999e-06, + "loss": 0.0778, + "num_tokens": 80629230.0, + "reward": 2.648890972137451, + "reward_std": 0.05023077875375748, + "rewards/reward_fn/mean": 2.648890972137451, + "rewards/reward_fn/std": 0.050230756402015686, + "step": 1745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 192.3125, + "completions/mean_terminated_length": 192.3125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.18521268696297868, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.099609375, + "kl": 0.028556083096191287, + "learning_rate": 7.301999999999999e-06, + "loss": 0.0011, + "num_tokens": 80658936.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1043.0, + "completions/max_terminated_length": 1043.0, + "completions/mean_length": 510.15625, + "completions/mean_terminated_length": 510.15625, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.18531876524875357, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046875, + "kl": 0.02161628007888794, + "learning_rate": 7.3016e-06, + "loss": 0.0176, + "num_tokens": 80715997.0, + "reward": 2.8921079635620117, + "reward_std": 0.3677787184715271, + "rewards/reward_fn/mean": 2.8921079635620117, + "rewards/reward_fn/std": 0.3677787482738495, + "step": 1747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 191.84375, + "completions/mean_terminated_length": 191.84375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.18542484353452848, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.029479774879291654, + "learning_rate": 7.3012e-06, + "loss": -0.0478, + "num_tokens": 80761304.0, + "reward": 3.9663615226745605, + "reward_std": 0.19028803706169128, + "rewards/reward_fn/mean": 3.9663615226745605, + "rewards/reward_fn/std": 0.19028803706169128, + "step": 1748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1029.0, + "completions/max_terminated_length": 1029.0, + "completions/mean_length": 340.59375, + "completions/mean_terminated_length": 340.59375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.1855309218203034, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0791015625, + "kl": 0.025199016323313117, + "learning_rate": 7.3008e-06, + "loss": 0.001, + "num_tokens": 80808139.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 754.0, + "completions/max_terminated_length": 754.0, + "completions/mean_length": 221.4375, + "completions/mean_terminated_length": 221.4375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.18563700010607828, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.546875, + "kl": 0.02979375934228301, + "learning_rate": 7.3004e-06, + "loss": 0.1565, + "num_tokens": 80865401.0, + "reward": 3.743729591369629, + "reward_std": 0.6451296210289001, + "rewards/reward_fn/mean": 3.743729591369629, + "rewards/reward_fn/std": 0.6451296210289001, + "step": 1750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 900.0, + "completions/max_terminated_length": 900.0, + "completions/mean_length": 196.0, + "completions/mean_terminated_length": 196.0, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.1857430783918532, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.04355457844212651, + "learning_rate": 7.2999999999999996e-06, + "loss": -0.0124, + "num_tokens": 80906489.0, + "reward": 3.9675936698913574, + "reward_std": 0.18331791460514069, + "rewards/reward_fn/mean": 3.9675936698913574, + "rewards/reward_fn/std": 0.18331791460514069, + "step": 1751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 762.0, + "completions/max_terminated_length": 762.0, + "completions/mean_length": 537.4375, + "completions/mean_terminated_length": 537.4375, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.18584915667762808, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.01735016261227429, + "learning_rate": 7.2995999999999995e-06, + "loss": 0.0577, + "num_tokens": 80971911.0, + "reward": 2.747972249984741, + "reward_std": 0.04789024218916893, + "rewards/reward_fn/mean": 2.747972249984741, + "rewards/reward_fn/std": 0.04789023473858833, + "step": 1752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1241.0, + "completions/max_terminated_length": 1241.0, + "completions/mean_length": 360.625, + "completions/mean_terminated_length": 360.625, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.185955234963403, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.016449951217509806, + "learning_rate": 7.2991999999999995e-06, + "loss": 0.0972, + "num_tokens": 81028475.0, + "reward": 3.906682252883911, + "reward_std": 0.29498347640037537, + "rewards/reward_fn/mean": 3.906682252883911, + "rewards/reward_fn/std": 0.29498350620269775, + "step": 1753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 197.6875, + "completions/mean_terminated_length": 197.6875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.1860613132491779, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.024210747331380844, + "learning_rate": 7.2987999999999995e-06, + "loss": 0.044, + "num_tokens": 81070289.0, + "reward": 2.903296947479248, + "reward_std": 0.4796822965145111, + "rewards/reward_fn/mean": 2.903296947479248, + "rewards/reward_fn/std": 0.4796823263168335, + "step": 1754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1133.0, + "completions/max_terminated_length": 1133.0, + "completions/mean_length": 335.09375, + "completions/mean_terminated_length": 335.09375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.1861673915349528, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.023606545757502317, + "learning_rate": 7.2983999999999994e-06, + "loss": 0.0337, + "num_tokens": 81119316.0, + "reward": 3.511998176574707, + "reward_std": 0.6018990278244019, + "rewards/reward_fn/mean": 3.511998176574707, + "rewards/reward_fn/std": 0.6018990278244019, + "step": 1755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1570.0, + "completions/mean_length": 1084.625, + "completions/mean_terminated_length": 1020.4000244140625, + "completions/min_length": 552.0, + "completions/min_terminated_length": 552.0, + "epoch": 0.1862734698207277, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.953125, + "kl": 0.01310090278275311, + "learning_rate": 7.297999999999999e-06, + "loss": 0.1005, + "num_tokens": 81194376.0, + "reward": 3.3202013969421387, + "reward_std": 1.109673261642456, + "rewards/reward_fn/mean": 3.3202013969421387, + "rewards/reward_fn/std": 1.109673261642456, + "step": 1756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 863.0, + "completions/max_terminated_length": 863.0, + "completions/mean_length": 269.96875, + "completions/mean_terminated_length": 269.96875, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.1863795481065026, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.02232472668401897, + "learning_rate": 7.297599999999999e-06, + "loss": -0.0044, + "num_tokens": 81246823.0, + "reward": 2.9757986068725586, + "reward_std": 0.21454162895679474, + "rewards/reward_fn/mean": 2.9757986068725586, + "rewards/reward_fn/std": 0.21454162895679474, + "step": 1757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.0, + "completions/max_terminated_length": 703.0, + "completions/mean_length": 238.03125, + "completions/mean_terminated_length": 238.03125, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.1864856263922775, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.02410236350260675, + "learning_rate": 7.2972e-06, + "loss": 0.0092, + "num_tokens": 81283624.0, + "reward": 3.3134825229644775, + "reward_std": 0.43799471855163574, + "rewards/reward_fn/mean": 3.3134825229644775, + "rewards/reward_fn/std": 0.43799474835395813, + "step": 1758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 126.21875, + "completions/mean_terminated_length": 126.21875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.1865917046780524, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.359375, + "kl": 0.019084823317825794, + "learning_rate": 7.2968e-06, + "loss": 0.1121, + "num_tokens": 81319631.0, + "reward": 2.9211864471435547, + "reward_std": 0.029545731842517853, + "rewards/reward_fn/mean": 2.9211864471435547, + "rewards/reward_fn/std": 0.0295457411557436, + "step": 1759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1060.0, + "completions/max_terminated_length": 1060.0, + "completions/mean_length": 270.9375, + "completions/mean_terminated_length": 270.9375, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.1866977829638273, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.02463018544949591, + "learning_rate": 7.2964e-06, + "loss": 0.1919, + "num_tokens": 81365325.0, + "reward": 3.0176355838775635, + "reward_std": 0.3355969190597534, + "rewards/reward_fn/mean": 3.0176355838775635, + "rewards/reward_fn/std": 0.3355969786643982, + "step": 1760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 180.40625, + "completions/mean_terminated_length": 180.40625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.18680386124960222, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.020807773573324084, + "learning_rate": 7.296e-06, + "loss": -0.0111, + "num_tokens": 81410778.0, + "reward": 3.9669175148010254, + "reward_std": 0.18714292347431183, + "rewards/reward_fn/mean": 3.9669175148010254, + "rewards/reward_fn/std": 0.18714292347431183, + "step": 1761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 710.0, + "completions/max_terminated_length": 710.0, + "completions/mean_length": 344.90625, + "completions/mean_terminated_length": 344.90625, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.1869099395353771, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.024345112266018987, + "learning_rate": 7.2956e-06, + "loss": 0.0946, + "num_tokens": 81475991.0, + "reward": 3.33730149269104, + "reward_std": 0.9492329955101013, + "rewards/reward_fn/mean": 3.33730149269104, + "rewards/reward_fn/std": 0.9492329955101013, + "step": 1762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 711.0, + "completions/max_terminated_length": 711.0, + "completions/mean_length": 295.0, + "completions/mean_terminated_length": 295.0, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.18701601782115201, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13671875, + "kl": 0.023923526634462178, + "learning_rate": 7.2952e-06, + "loss": 0.001, + "num_tokens": 81534167.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 159.0, + "completions/mean_terminated_length": 159.0, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.1871220961069269, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1376953125, + "kl": 0.03396447142586112, + "learning_rate": 7.2948e-06, + "loss": 0.0014, + "num_tokens": 81580951.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1609.0, + "completions/max_terminated_length": 1609.0, + "completions/mean_length": 445.4375, + "completions/mean_terminated_length": 445.4375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.1872281743927018, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.021805549738928676, + "learning_rate": 7.2944e-06, + "loss": 0.0994, + "num_tokens": 81633669.0, + "reward": 2.8578386306762695, + "reward_std": 0.36610347032546997, + "rewards/reward_fn/mean": 2.8578386306762695, + "rewards/reward_fn/std": 0.36610347032546997, + "step": 1765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 206.96875, + "completions/mean_terminated_length": 206.96875, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.18733425267847673, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9296875, + "kl": 0.0180403123376891, + "learning_rate": 7.293999999999999e-06, + "loss": 0.0031, + "num_tokens": 81672036.0, + "reward": 2.8211755752563477, + "reward_std": 0.21864524483680725, + "rewards/reward_fn/mean": 2.8211755752563477, + "rewards/reward_fn/std": 0.21864525973796844, + "step": 1766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1365.0, + "completions/max_terminated_length": 1365.0, + "completions/mean_length": 330.3125, + "completions/mean_terminated_length": 330.3125, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.1874403309642516, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.019215510925278068, + "learning_rate": 7.293599999999999e-06, + "loss": 0.0027, + "num_tokens": 81722926.0, + "reward": 3.3021492958068848, + "reward_std": 0.44588205218315125, + "rewards/reward_fn/mean": 3.3021492958068848, + "rewards/reward_fn/std": 0.44588202238082886, + "step": 1767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1151.0, + "completions/max_terminated_length": 1151.0, + "completions/mean_length": 301.84375, + "completions/mean_terminated_length": 301.84375, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.18754640925002652, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.02523637586273253, + "learning_rate": 7.293199999999999e-06, + "loss": -0.0158, + "num_tokens": 81764425.0, + "reward": 2.5516433715820312, + "reward_std": 0.436400830745697, + "rewards/reward_fn/mean": 2.5516433715820312, + "rewards/reward_fn/std": 0.43640080094337463, + "step": 1768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.0, + "completions/max_terminated_length": 525.0, + "completions/mean_length": 237.59375, + "completions/mean_terminated_length": 237.59375, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.1876524875358014, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12353515625, + "kl": 0.032501579727977514, + "learning_rate": 7.292799999999999e-06, + "loss": 0.0013, + "num_tokens": 81807548.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1532.0, + "completions/mean_length": 513.40625, + "completions/mean_terminated_length": 463.9031982421875, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.18775856582157632, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.026550387497991323, + "learning_rate": 7.2924e-06, + "loss": 0.1681, + "num_tokens": 81863881.0, + "reward": 2.7122297286987305, + "reward_std": 0.6148126721382141, + "rewards/reward_fn/mean": 2.7122297286987305, + "rewards/reward_fn/std": 0.6148126721382141, + "step": 1770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1070.0, + "completions/max_terminated_length": 1070.0, + "completions/mean_length": 288.84375, + "completions/mean_terminated_length": 288.84375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.18786464410735124, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1015625, + "kl": 0.026848802575841546, + "learning_rate": 7.292e-06, + "loss": 0.0011, + "num_tokens": 81909060.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 228.34375, + "completions/mean_terminated_length": 228.34375, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.18797072239312612, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.033349854638800025, + "learning_rate": 7.2916e-06, + "loss": 0.0913, + "num_tokens": 81959279.0, + "reward": 3.149571180343628, + "reward_std": 0.5094886422157288, + "rewards/reward_fn/mean": 3.149571180343628, + "rewards/reward_fn/std": 0.509488582611084, + "step": 1772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.0, + "completions/max_terminated_length": 743.0, + "completions/mean_length": 191.5, + "completions/mean_terminated_length": 191.5, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.18807680067890103, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11328125, + "kl": 0.02406395087018609, + "learning_rate": 7.2912e-06, + "loss": 0.001, + "num_tokens": 81996223.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1446.0, + "completions/mean_length": 579.75, + "completions/mean_terminated_length": 481.86669921875, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.18818287896467592, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "kl": 0.03457389399409294, + "learning_rate": 7.2908e-06, + "loss": 0.274, + "num_tokens": 82045591.0, + "reward": 2.5302791595458984, + "reward_std": 0.7885620594024658, + "rewards/reward_fn/mean": 2.5302791595458984, + "rewards/reward_fn/std": 0.7885620594024658, + "step": 1774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1887.0, + "completions/max_terminated_length": 1887.0, + "completions/mean_length": 469.25, + "completions/mean_terminated_length": 469.25, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.18828895725045083, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.029545899014919996, + "learning_rate": 7.2904e-06, + "loss": 0.1866, + "num_tokens": 82123359.0, + "reward": 3.565403699874878, + "reward_std": 0.9191962480545044, + "rewards/reward_fn/mean": 3.565403699874878, + "rewards/reward_fn/std": 0.9191962480545044, + "step": 1775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 706.0, + "completions/max_terminated_length": 706.0, + "completions/mean_length": 164.96875, + "completions/mean_terminated_length": 164.96875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.18839503553622575, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5625, + "kl": 0.02256469742860645, + "learning_rate": 7.29e-06, + "loss": 0.0788, + "num_tokens": 82165246.0, + "reward": 3.9026288986206055, + "reward_std": 0.3076043426990509, + "rewards/reward_fn/mean": 3.9026288986206055, + "rewards/reward_fn/std": 0.3076043725013733, + "step": 1776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 790.0, + "completions/max_terminated_length": 790.0, + "completions/mean_length": 214.96875, + "completions/mean_terminated_length": 214.96875, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.18850111382200063, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11767578125, + "kl": 0.02400966896675527, + "learning_rate": 7.2896e-06, + "loss": 0.001, + "num_tokens": 82206333.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 774.0, + "completions/max_terminated_length": 774.0, + "completions/mean_length": 290.875, + "completions/mean_terminated_length": 290.875, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.18860719210777555, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.146484375, + "kl": 0.021961679798550904, + "learning_rate": 7.2892e-06, + "loss": 0.0009, + "num_tokens": 82255705.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 878.0, + "completions/max_terminated_length": 878.0, + "completions/mean_length": 321.21875, + "completions/mean_terminated_length": 321.21875, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.18871327039355043, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09033203125, + "kl": 0.02163318544626236, + "learning_rate": 7.2887999999999996e-06, + "loss": 0.0009, + "num_tokens": 82303200.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1254.0, + "completions/max_terminated_length": 1254.0, + "completions/mean_length": 205.6875, + "completions/mean_terminated_length": 205.6875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.18881934867932534, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09375, + "kl": 0.020103381713852286, + "learning_rate": 7.2883999999999995e-06, + "loss": 0.0008, + "num_tokens": 82342710.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 294.5625, + "completions/mean_terminated_length": 294.5625, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.18892542696510026, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.023567187832668424, + "learning_rate": 7.2879999999999995e-06, + "loss": 0.0174, + "num_tokens": 82395080.0, + "reward": 3.1176083087921143, + "reward_std": 0.5186969041824341, + "rewards/reward_fn/mean": 3.1176083087921143, + "rewards/reward_fn/std": 0.5186969041824341, + "step": 1781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 973.0, + "completions/max_terminated_length": 973.0, + "completions/mean_length": 316.5625, + "completions/mean_terminated_length": 316.5625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.18903150525087514, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.024137710686773062, + "learning_rate": 7.2876e-06, + "loss": -0.0694, + "num_tokens": 82450394.0, + "reward": 2.7653896808624268, + "reward_std": 0.0928201898932457, + "rewards/reward_fn/mean": 2.7653896808624268, + "rewards/reward_fn/std": 0.09282021969556808, + "step": 1782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1543.0, + "completions/max_terminated_length": 1543.0, + "completions/mean_length": 410.03125, + "completions/mean_terminated_length": 410.03125, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.18913758353665006, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.037990636890754104, + "learning_rate": 7.2872e-06, + "loss": 0.023, + "num_tokens": 82499099.0, + "reward": 2.971921443939209, + "reward_std": 0.3378731310367584, + "rewards/reward_fn/mean": 2.971921443939209, + "rewards/reward_fn/std": 0.3378731608390808, + "step": 1783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 211.96875, + "completions/mean_terminated_length": 211.96875, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.18924366182242494, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9765625, + "kl": 0.025865831412374973, + "learning_rate": 7.2868e-06, + "loss": -0.0036, + "num_tokens": 82542042.0, + "reward": 3.0052905082702637, + "reward_std": 0.05251338332891464, + "rewards/reward_fn/mean": 3.0052905082702637, + "rewards/reward_fn/std": 0.05251337215304375, + "step": 1784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1905.0, + "completions/max_terminated_length": 1905.0, + "completions/mean_length": 417.40625, + "completions/mean_terminated_length": 417.40625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.18934974010819985, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.021670245798304677, + "learning_rate": 7.2864e-06, + "loss": 0.141, + "num_tokens": 82600455.0, + "reward": 2.866835594177246, + "reward_std": 0.07305392622947693, + "rewards/reward_fn/mean": 2.866835594177246, + "rewards/reward_fn/std": 0.07305389642715454, + "step": 1785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 151.4375, + "completions/mean_terminated_length": 151.4375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.18945581839397474, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.087890625, + "kl": 0.020182526553981006, + "learning_rate": 7.285999999999999e-06, + "loss": 0.0008, + "num_tokens": 82641301.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 857.53125, + "completions/mean_terminated_length": 734.3793334960938, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.18956189667974965, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.018903368851169944, + "learning_rate": 7.285599999999999e-06, + "loss": 0.1813, + "num_tokens": 82711942.0, + "reward": 2.3174896240234375, + "reward_std": 0.9160798192024231, + "rewards/reward_fn/mean": 2.3174896240234375, + "rewards/reward_fn/std": 0.9160798192024231, + "step": 1787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1586.0, + "completions/mean_length": 443.40625, + "completions/mean_terminated_length": 391.6451416015625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.18966797496552457, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.029990455135703087, + "learning_rate": 7.285199999999999e-06, + "loss": 0.2298, + "num_tokens": 82789395.0, + "reward": 2.748314380645752, + "reward_std": 0.512526273727417, + "rewards/reward_fn/mean": 2.748314380645752, + "rewards/reward_fn/std": 0.512526273727417, + "step": 1788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 859.0, + "completions/max_terminated_length": 859.0, + "completions/mean_length": 315.90625, + "completions/mean_terminated_length": 315.90625, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.18977405325129945, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.01661941665224731, + "learning_rate": 7.284799999999999e-06, + "loss": -0.0135, + "num_tokens": 82847696.0, + "reward": 3.9684324264526367, + "reward_std": 0.1785728633403778, + "rewards/reward_fn/mean": 3.9684324264526367, + "rewards/reward_fn/std": 0.1785728633403778, + "step": 1789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 163.0625, + "completions/mean_terminated_length": 163.0625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.18988013153707436, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.076171875, + "kl": 0.016163927502930164, + "learning_rate": 7.284399999999999e-06, + "loss": 0.0006, + "num_tokens": 82904786.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.0, + "completions/max_terminated_length": 601.0, + "completions/mean_length": 196.90625, + "completions/mean_terminated_length": 196.90625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.18998620982284925, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.021043762797489762, + "learning_rate": 7.283999999999999e-06, + "loss": 0.0344, + "num_tokens": 82945903.0, + "reward": 2.891822338104248, + "reward_std": 0.2067325860261917, + "rewards/reward_fn/mean": 2.891822338104248, + "rewards/reward_fn/std": 0.20673255622386932, + "step": 1791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 726.0, + "completions/max_terminated_length": 726.0, + "completions/mean_length": 138.78125, + "completions/mean_terminated_length": 138.78125, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.19009228810862416, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.53125, + "kl": 0.042682474479079247, + "learning_rate": 7.283599999999999e-06, + "loss": 0.0077, + "num_tokens": 82982600.0, + "reward": 3.008596181869507, + "reward_std": 0.0607355572283268, + "rewards/reward_fn/mean": 3.008596181869507, + "rewards/reward_fn/std": 0.060735564678907394, + "step": 1792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1132.0, + "completions/max_terminated_length": 1132.0, + "completions/mean_length": 429.90625, + "completions/mean_terminated_length": 429.90625, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.19019836639439908, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.021921277744695544, + "learning_rate": 7.283199999999999e-06, + "loss": 0.1219, + "num_tokens": 83020037.0, + "reward": 2.7947468757629395, + "reward_std": 0.028822239488363266, + "rewards/reward_fn/mean": 2.7947468757629395, + "rewards/reward_fn/std": 0.02882222831249237, + "step": 1793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1693.0, + "completions/max_terminated_length": 1693.0, + "completions/mean_length": 565.5, + "completions/mean_terminated_length": 565.5, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.19030444468017396, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.021824662806466222, + "learning_rate": 7.2828e-06, + "loss": 0.208, + "num_tokens": 83060469.0, + "reward": 3.3677079677581787, + "reward_std": 0.8257449865341187, + "rewards/reward_fn/mean": 3.3677079677581787, + "rewards/reward_fn/std": 0.8257449269294739, + "step": 1794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 895.0, + "completions/max_terminated_length": 895.0, + "completions/mean_length": 269.1875, + "completions/mean_terminated_length": 269.1875, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.19041052296594888, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.02564499992877245, + "learning_rate": 7.2824e-06, + "loss": -0.0528, + "num_tokens": 83100443.0, + "reward": 3.9307758808135986, + "reward_std": 0.39159107208251953, + "rewards/reward_fn/mean": 3.9307758808135986, + "rewards/reward_fn/std": 0.39159104228019714, + "step": 1795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1098.0, + "completions/max_terminated_length": 1098.0, + "completions/mean_length": 286.53125, + "completions/mean_terminated_length": 286.53125, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.19051660125172376, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.038170166313648224, + "learning_rate": 7.282e-06, + "loss": 0.0912, + "num_tokens": 83139020.0, + "reward": 3.124636650085449, + "reward_std": 0.257290244102478, + "rewards/reward_fn/mean": 3.124636650085449, + "rewards/reward_fn/std": 0.25729018449783325, + "step": 1796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1346.0, + "completions/max_terminated_length": 1346.0, + "completions/mean_length": 335.84375, + "completions/mean_terminated_length": 335.84375, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.19062267953749867, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10009765625, + "kl": 0.026702363276854157, + "learning_rate": 7.2816e-06, + "loss": 0.0011, + "num_tokens": 83180263.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 174.8125, + "completions/mean_terminated_length": 174.8125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.1907287578232736, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "kl": 0.021874910918995738, + "learning_rate": 7.2812e-06, + "loss": 0.0139, + "num_tokens": 83227905.0, + "reward": 3.0870790481567383, + "reward_std": 0.4484405815601349, + "rewards/reward_fn/mean": 3.0870790481567383, + "rewards/reward_fn/std": 0.4484405815601349, + "step": 1798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.0, + "completions/max_terminated_length": 757.0, + "completions/mean_length": 280.53125, + "completions/mean_terminated_length": 280.53125, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.19083483610904847, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.022913979832082987, + "learning_rate": 7.2808e-06, + "loss": 0.0525, + "num_tokens": 83266994.0, + "reward": 3.892354965209961, + "reward_std": 0.444100558757782, + "rewards/reward_fn/mean": 3.892354965209961, + "rewards/reward_fn/std": 0.4441005289554596, + "step": 1799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 929.0, + "completions/max_terminated_length": 929.0, + "completions/mean_length": 296.0625, + "completions/mean_terminated_length": 296.0625, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.19094091439482339, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.021208511432632804, + "learning_rate": 7.2804e-06, + "loss": -0.0788, + "num_tokens": 83294420.0, + "reward": 3.3324835300445557, + "reward_std": 0.3951815366744995, + "rewards/reward_fn/mean": 3.3324835300445557, + "rewards/reward_fn/std": 0.3951815366744995, + "step": 1800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 343.90625, + "completions/mean_terminated_length": 343.90625, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.19104699268059827, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.020625728298909962, + "learning_rate": 7.28e-06, + "loss": -0.0446, + "num_tokens": 83343409.0, + "reward": 3.7953171730041504, + "reward_std": 0.5966523885726929, + "rewards/reward_fn/mean": 3.7953171730041504, + "rewards/reward_fn/std": 0.5966523289680481, + "step": 1801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 194.0, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 147.21875, + "completions/mean_terminated_length": 147.21875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.19115307096637318, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08984375, + "kl": 0.01696223858743906, + "learning_rate": 7.2796e-06, + "loss": 0.0007, + "num_tokens": 83383480.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 834.0, + "completions/max_terminated_length": 834.0, + "completions/mean_length": 211.21875, + "completions/mean_terminated_length": 211.21875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.1912591492521481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.103515625, + "kl": 0.02501624054275453, + "learning_rate": 7.2792e-06, + "loss": 0.001, + "num_tokens": 83426655.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 858.0, + "completions/max_terminated_length": 858.0, + "completions/mean_length": 241.34375, + "completions/mean_terminated_length": 241.34375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.19136522753792298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1171875, + "kl": 0.024438521591946483, + "learning_rate": 7.2788e-06, + "loss": 0.001, + "num_tokens": 83454922.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 107.0625, + "completions/mean_terminated_length": 107.0625, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.1914713058236979, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1240234375, + "kl": 0.01948395639192313, + "learning_rate": 7.2784000000000005e-06, + "loss": 0.0008, + "num_tokens": 83491020.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1266.0, + "completions/max_terminated_length": 1266.0, + "completions/mean_length": 381.8125, + "completions/mean_terminated_length": 381.8125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.19157738410947278, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.96875, + "kl": 0.025701815960928798, + "learning_rate": 7.278e-06, + "loss": -0.0646, + "num_tokens": 83524998.0, + "reward": 3.9303698539733887, + "reward_std": 0.2739916145801544, + "rewards/reward_fn/mean": 3.9303698539733887, + "rewards/reward_fn/std": 0.2739916443824768, + "step": 1806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 860.0, + "completions/max_terminated_length": 860.0, + "completions/mean_length": 215.03125, + "completions/mean_terminated_length": 215.03125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.1916834623952477, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08642578125, + "kl": 0.0199962422484532, + "learning_rate": 7.2775999999999996e-06, + "loss": 0.0008, + "num_tokens": 83564807.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 207.875, + "completions/mean_terminated_length": 207.875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.1917895406810226, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.02291672769933939, + "learning_rate": 7.2771999999999995e-06, + "loss": -0.0238, + "num_tokens": 83606563.0, + "reward": 3.8847744464874268, + "reward_std": 0.31120461225509644, + "rewards/reward_fn/mean": 3.8847744464874268, + "rewards/reward_fn/std": 0.31120461225509644, + "step": 1808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 95.0, + "completions/max_terminated_length": 95.0, + "completions/mean_length": 68.5, + "completions/mean_terminated_length": 68.5, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.1918956189667975, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.193359375, + "kl": 0.029097398975864053, + "learning_rate": 7.2767999999999995e-06, + "loss": 0.0012, + "num_tokens": 83639059.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 261.84375, + "completions/mean_terminated_length": 261.84375, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.1920016972525724, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.02024500584229827, + "learning_rate": 7.2763999999999995e-06, + "loss": 0.0566, + "num_tokens": 83690190.0, + "reward": 3.8510398864746094, + "reward_std": 0.40058434009552, + "rewards/reward_fn/mean": 3.8510398864746094, + "rewards/reward_fn/std": 0.4005843698978424, + "step": 1810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 936.0, + "completions/max_terminated_length": 936.0, + "completions/mean_length": 261.5, + "completions/mean_terminated_length": 261.5, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.1921077755383473, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9296875, + "kl": 0.02715472411364317, + "learning_rate": 7.2759999999999995e-06, + "loss": -0.0709, + "num_tokens": 83734110.0, + "reward": 3.7524542808532715, + "reward_std": 0.5632277131080627, + "rewards/reward_fn/mean": 3.7524542808532715, + "rewards/reward_fn/std": 0.5632277131080627, + "step": 1811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 720.0, + "completions/max_terminated_length": 720.0, + "completions/mean_length": 153.375, + "completions/mean_terminated_length": 153.375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.1922138538241222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0693359375, + "kl": 0.014898709952831268, + "learning_rate": 7.2755999999999994e-06, + "loss": 0.0006, + "num_tokens": 83774794.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 606.0, + "completions/mean_length": 536.3125, + "completions/mean_terminated_length": 435.5333557128906, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.1923199321098971, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.02957181097008288, + "learning_rate": 7.275199999999999e-06, + "loss": 0.2706, + "num_tokens": 83851540.0, + "reward": 3.250636100769043, + "reward_std": 1.182026982307434, + "rewards/reward_fn/mean": 3.250636100769043, + "rewards/reward_fn/std": 1.182026982307434, + "step": 1813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1330.0, + "completions/max_terminated_length": 1330.0, + "completions/mean_length": 369.59375, + "completions/mean_terminated_length": 369.59375, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.192426010395672, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.02410293696448207, + "learning_rate": 7.274799999999999e-06, + "loss": -0.0188, + "num_tokens": 83894567.0, + "reward": 3.713106870651245, + "reward_std": 0.7775752544403076, + "rewards/reward_fn/mean": 3.713106870651245, + "rewards/reward_fn/std": 0.7775752544403076, + "step": 1814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1097.0, + "completions/max_terminated_length": 1097.0, + "completions/mean_length": 406.15625, + "completions/mean_terminated_length": 406.15625, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.19253208868144692, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "kl": 0.02121006278321147, + "learning_rate": 7.274399999999999e-06, + "loss": 0.1567, + "num_tokens": 83928780.0, + "reward": 2.624467611312866, + "reward_std": 0.3308447301387787, + "rewards/reward_fn/mean": 2.624467611312866, + "rewards/reward_fn/std": 0.3308447301387787, + "step": 1815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 889.0, + "completions/max_terminated_length": 889.0, + "completions/mean_length": 209.875, + "completions/mean_terminated_length": 209.875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.1926381669672218, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.02947010798379779, + "learning_rate": 7.273999999999999e-06, + "loss": 0.0666, + "num_tokens": 83981352.0, + "reward": 3.722154140472412, + "reward_std": 0.3915632367134094, + "rewards/reward_fn/mean": 3.722154140472412, + "rewards/reward_fn/std": 0.3915632665157318, + "step": 1816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 865.0, + "completions/max_terminated_length": 865.0, + "completions/mean_length": 360.75, + "completions/mean_terminated_length": 360.75, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.19274424525299672, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.022145352559164166, + "learning_rate": 7.2736e-06, + "loss": 0.0137, + "num_tokens": 84012608.0, + "reward": 3.703070640563965, + "reward_std": 0.5241351127624512, + "rewards/reward_fn/mean": 3.703070640563965, + "rewards/reward_fn/std": 0.5241351127624512, + "step": 1817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1276.0, + "completions/max_terminated_length": 1276.0, + "completions/mean_length": 393.71875, + "completions/mean_terminated_length": 393.71875, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.1928503235387716, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.0181597942719236, + "learning_rate": 7.2732e-06, + "loss": -0.049, + "num_tokens": 84062103.0, + "reward": 3.3613734245300293, + "reward_std": 0.6499727368354797, + "rewards/reward_fn/mean": 3.3613734245300293, + "rewards/reward_fn/std": 0.6499727964401245, + "step": 1818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 928.0, + "completions/max_terminated_length": 928.0, + "completions/mean_length": 246.6875, + "completions/mean_terminated_length": 246.6875, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.19295640182454651, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1171875, + "kl": 0.027381795225664973, + "learning_rate": 7.2728e-06, + "loss": 0.0011, + "num_tokens": 84111693.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 941.0, + "completions/max_terminated_length": 941.0, + "completions/mean_length": 305.625, + "completions/mean_terminated_length": 305.625, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.19306248011032143, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0712890625, + "kl": 0.021608802489936352, + "learning_rate": 7.2724e-06, + "loss": 0.0009, + "num_tokens": 84154113.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 925.0, + "completions/mean_length": 409.59375, + "completions/mean_terminated_length": 356.7419128417969, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.1931685583960963, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.025535868713632226, + "learning_rate": 7.272e-06, + "loss": 0.294, + "num_tokens": 84195860.0, + "reward": 2.834456443786621, + "reward_std": 0.5542919635772705, + "rewards/reward_fn/mean": 2.834456443786621, + "rewards/reward_fn/std": 0.5542919635772705, + "step": 1821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1094.0, + "completions/max_terminated_length": 1094.0, + "completions/mean_length": 208.875, + "completions/mean_terminated_length": 208.875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.19327463668187123, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.46875, + "kl": 0.015965044614858925, + "learning_rate": 7.2716e-06, + "loss": 0.0429, + "num_tokens": 84235664.0, + "reward": 3.87819242477417, + "reward_std": 0.4338008165359497, + "rewards/reward_fn/mean": 3.87819242477417, + "rewards/reward_fn/std": 0.4338007867336273, + "step": 1822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 672.0, + "completions/max_terminated_length": 672.0, + "completions/mean_length": 215.90625, + "completions/mean_terminated_length": 215.90625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.1933807149676461, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.020084471092559397, + "learning_rate": 7.2712e-06, + "loss": 0.0008, + "num_tokens": 84294797.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1409.0, + "completions/max_terminated_length": 1409.0, + "completions/mean_length": 350.53125, + "completions/mean_terminated_length": 350.53125, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.19348679325342102, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.022387655219063163, + "learning_rate": 7.2708e-06, + "loss": 0.1434, + "num_tokens": 84348126.0, + "reward": 3.8110899925231934, + "reward_std": 0.44719943404197693, + "rewards/reward_fn/mean": 3.8110899925231934, + "rewards/reward_fn/std": 0.44719937443733215, + "step": 1824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1687.0, + "completions/mean_length": 395.15625, + "completions/mean_terminated_length": 341.8387145996094, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.19359287153919594, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.02443688898347318, + "learning_rate": 7.2704e-06, + "loss": 0.3221, + "num_tokens": 84409091.0, + "reward": 2.8401429653167725, + "reward_std": 0.5243302583694458, + "rewards/reward_fn/mean": 2.8401429653167725, + "rewards/reward_fn/std": 0.5243302583694458, + "step": 1825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1016.0, + "completions/max_terminated_length": 1016.0, + "completions/mean_length": 218.15625, + "completions/mean_terminated_length": 218.15625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.19369894982497082, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.024313464295119047, + "learning_rate": 7.269999999999999e-06, + "loss": -0.1331, + "num_tokens": 84437768.0, + "reward": 3.7873826026916504, + "reward_std": 0.6716558337211609, + "rewards/reward_fn/mean": 3.7873826026916504, + "rewards/reward_fn/std": 0.6716558337211609, + "step": 1826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1220.0, + "completions/max_terminated_length": 1220.0, + "completions/mean_length": 493.40625, + "completions/mean_terminated_length": 493.40625, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.19380502811074574, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.020324668614193797, + "learning_rate": 7.269599999999999e-06, + "loss": -0.0033, + "num_tokens": 84492021.0, + "reward": 3.0298476219177246, + "reward_std": 0.3211779296398163, + "rewards/reward_fn/mean": 3.0298476219177246, + "rewards/reward_fn/std": 0.3211778700351715, + "step": 1827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1439.0, + "completions/mean_length": 569.59375, + "completions/mean_terminated_length": 471.0333557128906, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.19391110639652062, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.022751206997781992, + "learning_rate": 7.269199999999999e-06, + "loss": 0.3061, + "num_tokens": 84543784.0, + "reward": 3.6765613555908203, + "reward_std": 1.0499624013900757, + "rewards/reward_fn/mean": 3.6765613555908203, + "rewards/reward_fn/std": 1.0499624013900757, + "step": 1828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 322.9375, + "completions/mean_terminated_length": 322.9375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.19401718468229553, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.083984375, + "kl": 0.028194759273901582, + "learning_rate": 7.2688e-06, + "loss": 0.0011, + "num_tokens": 84589094.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 136.96875, + "completions/mean_terminated_length": 136.96875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.19412326296807045, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09228515625, + "kl": 0.022485103458166122, + "learning_rate": 7.2684e-06, + "loss": 0.0009, + "num_tokens": 84616997.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 680.0, + "completions/max_terminated_length": 680.0, + "completions/mean_length": 452.59375, + "completions/mean_terminated_length": 452.59375, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.19422934125384533, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.024805004009976983, + "learning_rate": 7.268e-06, + "loss": 0.0591, + "num_tokens": 84664056.0, + "reward": 2.991624355316162, + "reward_std": 0.3909732401371002, + "rewards/reward_fn/mean": 2.991624355316162, + "rewards/reward_fn/std": 0.3909732699394226, + "step": 1831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 270.65625, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.19433541953962025, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.030655317706987262, + "learning_rate": 7.2676e-06, + "loss": 0.0186, + "num_tokens": 84705997.0, + "reward": 3.4060113430023193, + "reward_std": 0.6430683135986328, + "rewards/reward_fn/mean": 3.4060113430023193, + "rewards/reward_fn/std": 0.643068253993988, + "step": 1832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 215.375, + "completions/mean_terminated_length": 215.375, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.19444149782539513, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.027888069627806544, + "learning_rate": 7.2672e-06, + "loss": -0.0243, + "num_tokens": 84754777.0, + "reward": 3.6079330444335938, + "reward_std": 0.5912787914276123, + "rewards/reward_fn/mean": 3.6079330444335938, + "rewards/reward_fn/std": 0.5912788510322571, + "step": 1833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1301.0, + "completions/max_terminated_length": 1301.0, + "completions/mean_length": 343.625, + "completions/mean_terminated_length": 343.625, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.19454757611117005, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.027522264514118433, + "learning_rate": 7.2668e-06, + "loss": 0.2485, + "num_tokens": 84806573.0, + "reward": 3.0881223678588867, + "reward_std": 0.046560484915971756, + "rewards/reward_fn/mean": 3.0881223678588867, + "rewards/reward_fn/std": 0.04656045511364937, + "step": 1834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1224.0, + "completions/max_terminated_length": 1224.0, + "completions/mean_length": 256.75, + "completions/mean_terminated_length": 256.75, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.19465365439694496, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10302734375, + "kl": 0.0238818796351552, + "learning_rate": 7.2664e-06, + "loss": 0.001, + "num_tokens": 84847717.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1020.0, + "completions/max_terminated_length": 1020.0, + "completions/mean_length": 346.59375, + "completions/mean_terminated_length": 346.59375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.19475973268271984, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.0209172077011317, + "learning_rate": 7.2659999999999996e-06, + "loss": 0.2055, + "num_tokens": 84896376.0, + "reward": 2.8978328704833984, + "reward_std": 0.07092181593179703, + "rewards/reward_fn/mean": 2.8978328704833984, + "rewards/reward_fn/std": 0.07092180103063583, + "step": 1836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1739.0, + "completions/max_terminated_length": 1739.0, + "completions/mean_length": 639.03125, + "completions/mean_terminated_length": 639.03125, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "epoch": 0.19486581096849476, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.02236326946876943, + "learning_rate": 7.2655999999999995e-06, + "loss": 0.0086, + "num_tokens": 84934841.0, + "reward": 2.4415290355682373, + "reward_std": 0.6754915714263916, + "rewards/reward_fn/mean": 2.4415290355682373, + "rewards/reward_fn/std": 0.6754916310310364, + "step": 1837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.0, + "completions/max_terminated_length": 628.0, + "completions/mean_length": 352.0, + "completions/mean_terminated_length": 352.0, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.19497188925426964, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.022676357068121433, + "learning_rate": 7.2651999999999995e-06, + "loss": 0.0294, + "num_tokens": 84990361.0, + "reward": 3.776266574859619, + "reward_std": 0.7067487239837646, + "rewards/reward_fn/mean": 3.776266574859619, + "rewards/reward_fn/std": 0.7067488431930542, + "step": 1838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 696.0, + "completions/max_terminated_length": 696.0, + "completions/mean_length": 185.21875, + "completions/mean_terminated_length": 185.21875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.19507796754004456, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.018021252821199596, + "learning_rate": 7.2647999999999995e-06, + "loss": 0.0134, + "num_tokens": 85050912.0, + "reward": 3.879213571548462, + "reward_std": 0.3250885605812073, + "rewards/reward_fn/mean": 3.879213571548462, + "rewards/reward_fn/std": 0.3250885307788849, + "step": 1839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1688.0, + "completions/max_terminated_length": 1688.0, + "completions/mean_length": 507.25, + "completions/mean_terminated_length": 507.25, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.19518404582581944, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.026730315992608666, + "learning_rate": 7.2644e-06, + "loss": 0.1458, + "num_tokens": 85098440.0, + "reward": 2.9435012340545654, + "reward_std": 0.4082188308238983, + "rewards/reward_fn/mean": 2.9435012340545654, + "rewards/reward_fn/std": 0.4082188010215759, + "step": 1840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1029.0, + "completions/max_terminated_length": 1029.0, + "completions/mean_length": 227.65625, + "completions/mean_terminated_length": 227.65625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.19529012411159435, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.025480588898062706, + "learning_rate": 7.264e-06, + "loss": 0.1826, + "num_tokens": 85142173.0, + "reward": 2.987659454345703, + "reward_std": 0.13073208928108215, + "rewards/reward_fn/mean": 2.987659454345703, + "rewards/reward_fn/std": 0.13073207437992096, + "step": 1841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 761.0, + "completions/max_terminated_length": 761.0, + "completions/mean_length": 194.46875, + "completions/mean_terminated_length": 194.46875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.19539620239736927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1181640625, + "kl": 0.027264825999736786, + "learning_rate": 7.2636e-06, + "loss": 0.0011, + "num_tokens": 85177772.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1085.0, + "completions/max_terminated_length": 1085.0, + "completions/mean_length": 367.625, + "completions/mean_terminated_length": 367.625, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.19550228068314415, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.027547943871468306, + "learning_rate": 7.2632e-06, + "loss": 0.0731, + "num_tokens": 85233824.0, + "reward": 3.5169026851654053, + "reward_std": 0.6616706848144531, + "rewards/reward_fn/mean": 3.5169026851654053, + "rewards/reward_fn/std": 0.6616706848144531, + "step": 1843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1470.0, + "completions/max_terminated_length": 1470.0, + "completions/mean_length": 304.5625, + "completions/mean_terminated_length": 304.5625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.19560835896891907, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9375, + "kl": 0.02738300757482648, + "learning_rate": 7.2628e-06, + "loss": 0.1332, + "num_tokens": 85276146.0, + "reward": 3.798513889312744, + "reward_std": 0.4765705466270447, + "rewards/reward_fn/mean": 3.798513889312744, + "rewards/reward_fn/std": 0.47657057642936707, + "step": 1844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1129.0, + "completions/max_terminated_length": 1129.0, + "completions/mean_length": 320.40625, + "completions/mean_terminated_length": 320.40625, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.19571443725469395, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.02806469239294529, + "learning_rate": 7.2624e-06, + "loss": 0.0161, + "num_tokens": 85323135.0, + "reward": 3.9619522094726562, + "reward_std": 0.21523013710975647, + "rewards/reward_fn/mean": 3.9619522094726562, + "rewards/reward_fn/std": 0.21523013710975647, + "step": 1845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 848.0, + "completions/max_terminated_length": 848.0, + "completions/mean_length": 312.78125, + "completions/mean_terminated_length": 312.78125, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.19582051554046886, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.02015020337421447, + "learning_rate": 7.261999999999999e-06, + "loss": 0.0248, + "num_tokens": 85371896.0, + "reward": 3.5994532108306885, + "reward_std": 0.672566294670105, + "rewards/reward_fn/mean": 3.5994532108306885, + "rewards/reward_fn/std": 0.672566294670105, + "step": 1846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 808.0, + "completions/max_terminated_length": 808.0, + "completions/mean_length": 294.375, + "completions/mean_terminated_length": 294.375, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.19592659382624378, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.02237825153861195, + "learning_rate": 7.261599999999999e-06, + "loss": 0.0013, + "num_tokens": 85417892.0, + "reward": 3.958686351776123, + "reward_std": 0.23370474576950073, + "rewards/reward_fn/mean": 3.958686351776123, + "rewards/reward_fn/std": 0.23370479047298431, + "step": 1847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1311.0, + "completions/max_terminated_length": 1311.0, + "completions/mean_length": 368.375, + "completions/mean_terminated_length": 368.375, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.19603267211201866, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.025957881240174174, + "learning_rate": 7.261199999999999e-06, + "loss": -0.0124, + "num_tokens": 85469008.0, + "reward": 2.98433256149292, + "reward_std": 0.6673187017440796, + "rewards/reward_fn/mean": 2.98433256149292, + "rewards/reward_fn/std": 0.6673187017440796, + "step": 1848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 872.0, + "completions/max_terminated_length": 872.0, + "completions/mean_length": 221.5, + "completions/mean_terminated_length": 221.5, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.19613875039779358, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9765625, + "kl": 0.03066243208013475, + "learning_rate": 7.260799999999999e-06, + "loss": 0.0189, + "num_tokens": 85520032.0, + "reward": 3.3312742710113525, + "reward_std": 0.5270886421203613, + "rewards/reward_fn/mean": 3.3312742710113525, + "rewards/reward_fn/std": 0.5270887017250061, + "step": 1849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 983.0, + "completions/max_terminated_length": 983.0, + "completions/mean_length": 292.875, + "completions/mean_terminated_length": 292.875, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.19624482868356846, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.02144060772843659, + "learning_rate": 7.260399999999999e-06, + "loss": 0.0602, + "num_tokens": 85556412.0, + "reward": 3.9658844470977783, + "reward_std": 0.19298657774925232, + "rewards/reward_fn/mean": 3.9658844470977783, + "rewards/reward_fn/std": 0.19298657774925232, + "step": 1850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 161.28125, + "completions/mean_terminated_length": 161.28125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.19635090696934338, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.021163585828617215, + "learning_rate": 7.259999999999999e-06, + "loss": -0.0845, + "num_tokens": 85610405.0, + "reward": 2.866489887237549, + "reward_std": 0.5268675684928894, + "rewards/reward_fn/mean": 2.866489887237549, + "rewards/reward_fn/std": 0.5268676280975342, + "step": 1851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1304.0, + "completions/max_terminated_length": 1304.0, + "completions/mean_length": 469.71875, + "completions/mean_terminated_length": 469.71875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.1964569852551183, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.15625, + "kl": 0.017452375264838338, + "learning_rate": 7.2596e-06, + "loss": 0.0485, + "num_tokens": 85657788.0, + "reward": 3.1726508140563965, + "reward_std": 0.7422636151313782, + "rewards/reward_fn/mean": 3.1726508140563965, + "rewards/reward_fn/std": 0.7422636151313782, + "step": 1852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1580.0, + "completions/max_terminated_length": 1580.0, + "completions/mean_length": 435.03125, + "completions/mean_terminated_length": 435.03125, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.19656306354089317, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.023819874972105026, + "learning_rate": 7.2592e-06, + "loss": -0.0031, + "num_tokens": 85716285.0, + "reward": 3.243497848510742, + "reward_std": 0.48213231563568115, + "rewards/reward_fn/mean": 3.243497848510742, + "rewards/reward_fn/std": 0.48213231563568115, + "step": 1853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1349.0, + "completions/max_terminated_length": 1349.0, + "completions/mean_length": 244.375, + "completions/mean_terminated_length": 244.375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.1966691418266681, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0966796875, + "kl": 0.02474185894243419, + "learning_rate": 7.2588e-06, + "loss": 0.001, + "num_tokens": 85761929.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1524.0, + "completions/max_terminated_length": 1524.0, + "completions/mean_length": 437.03125, + "completions/mean_terminated_length": 437.03125, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.19677522011244297, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.024578659562394023, + "learning_rate": 7.2584e-06, + "loss": -0.0169, + "num_tokens": 85827050.0, + "reward": 3.555436134338379, + "reward_std": 0.7116384506225586, + "rewards/reward_fn/mean": 3.555436134338379, + "rewards/reward_fn/std": 0.7116385102272034, + "step": 1855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 849.0, + "completions/max_terminated_length": 849.0, + "completions/mean_length": 308.8125, + "completions/mean_terminated_length": 308.8125, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.19688129839821789, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.02469022199511528, + "learning_rate": 7.258e-06, + "loss": 0.0686, + "num_tokens": 85877028.0, + "reward": 2.7643117904663086, + "reward_std": 0.04094107821583748, + "rewards/reward_fn/mean": 2.7643117904663086, + "rewards/reward_fn/std": 0.0409410260617733, + "step": 1856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 950.0, + "completions/max_terminated_length": 950.0, + "completions/mean_length": 129.3125, + "completions/mean_terminated_length": 129.3125, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.1969873766839928, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.115234375, + "kl": 0.02429241011850536, + "learning_rate": 7.2576e-06, + "loss": 0.001, + "num_tokens": 85896526.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 809.0, + "completions/max_terminated_length": 809.0, + "completions/mean_length": 231.03125, + "completions/mean_terminated_length": 231.03125, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.19709345496976768, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08740234375, + "kl": 0.02231363148894161, + "learning_rate": 7.2572e-06, + "loss": 0.0009, + "num_tokens": 85946607.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1097.0, + "completions/max_terminated_length": 1097.0, + "completions/mean_length": 171.71875, + "completions/mean_terminated_length": 171.71875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.1971995332555426, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.34375, + "kl": 0.02453322766814381, + "learning_rate": 7.2568e-06, + "loss": 0.2765, + "num_tokens": 85989382.0, + "reward": 3.923966646194458, + "reward_std": 0.2992479205131531, + "rewards/reward_fn/mean": 3.923966646194458, + "rewards/reward_fn/std": 0.2992479205131531, + "step": 1859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 283.9375, + "completions/mean_terminated_length": 283.9375, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.19730561154131748, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.025952878408133984, + "learning_rate": 7.2564e-06, + "loss": 0.0263, + "num_tokens": 86037284.0, + "reward": 3.002805233001709, + "reward_std": 0.48838910460472107, + "rewards/reward_fn/mean": 3.002805233001709, + "rewards/reward_fn/std": 0.4883890450000763, + "step": 1860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 683.0, + "completions/max_terminated_length": 683.0, + "completions/mean_length": 319.28125, + "completions/mean_terminated_length": 319.28125, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.1974116898270924, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.02484931843355298, + "learning_rate": 7.256e-06, + "loss": 0.0074, + "num_tokens": 86085517.0, + "reward": 3.792180061340332, + "reward_std": 0.5296205878257751, + "rewards/reward_fn/mean": 3.792180061340332, + "rewards/reward_fn/std": 0.5296205878257751, + "step": 1861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1432.0, + "completions/mean_length": 391.53125, + "completions/mean_terminated_length": 338.0967712402344, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.1975177681128673, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.023601802764460444, + "learning_rate": 7.2556e-06, + "loss": 0.3086, + "num_tokens": 86129406.0, + "reward": 3.803504228591919, + "reward_std": 0.8031951189041138, + "rewards/reward_fn/mean": 3.803504228591919, + "rewards/reward_fn/std": 0.8031951785087585, + "step": 1862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.0, + "completions/max_terminated_length": 565.0, + "completions/mean_length": 215.875, + "completions/mean_terminated_length": 215.875, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.1976238463986422, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12060546875, + "kl": 0.028308047214522958, + "learning_rate": 7.2552e-06, + "loss": 0.0011, + "num_tokens": 86176090.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1234.0, + "completions/max_terminated_length": 1234.0, + "completions/mean_length": 425.875, + "completions/mean_terminated_length": 425.875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.1977299246844171, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.022070118226110935, + "learning_rate": 7.2548e-06, + "loss": 0.0846, + "num_tokens": 86221430.0, + "reward": 3.93414306640625, + "reward_std": 0.25952455401420593, + "rewards/reward_fn/mean": 3.93414306640625, + "rewards/reward_fn/std": 0.25952455401420593, + "step": 1864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1249.0, + "completions/max_terminated_length": 1249.0, + "completions/mean_length": 464.375, + "completions/mean_terminated_length": 464.375, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.197836002970192, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.026702645933255553, + "learning_rate": 7.2544e-06, + "loss": 0.0719, + "num_tokens": 86280162.0, + "reward": 3.4960038661956787, + "reward_std": 0.583372175693512, + "rewards/reward_fn/mean": 3.4960038661956787, + "rewards/reward_fn/std": 0.583372175693512, + "step": 1865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 268.34375, + "completions/mean_terminated_length": 268.34375, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.1979420812559669, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.03491222928278148, + "learning_rate": 7.2539999999999995e-06, + "loss": -0.0919, + "num_tokens": 86320909.0, + "reward": 3.5736351013183594, + "reward_std": 0.4795069098472595, + "rewards/reward_fn/mean": 3.5736351013183594, + "rewards/reward_fn/std": 0.4795069098472595, + "step": 1866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 832.0, + "completions/max_terminated_length": 832.0, + "completions/mean_length": 301.21875, + "completions/mean_terminated_length": 301.21875, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.1980481595417418, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.023877075407654047, + "learning_rate": 7.2535999999999995e-06, + "loss": -0.022, + "num_tokens": 86362292.0, + "reward": 3.648834228515625, + "reward_std": 0.529570996761322, + "rewards/reward_fn/mean": 3.648834228515625, + "rewards/reward_fn/std": 0.5295709371566772, + "step": 1867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1631.0, + "completions/max_terminated_length": 1631.0, + "completions/mean_length": 284.46875, + "completions/mean_terminated_length": 284.46875, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.1981542378275167, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.03326743561774492, + "learning_rate": 7.2531999999999994e-06, + "loss": 0.0016, + "num_tokens": 86419203.0, + "reward": 2.758133888244629, + "reward_std": 0.042066995054483414, + "rewards/reward_fn/mean": 2.758133888244629, + "rewards/reward_fn/std": 0.04206700250506401, + "step": 1868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.0, + "completions/max_terminated_length": 645.0, + "completions/mean_length": 166.84375, + "completions/mean_terminated_length": 166.84375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.19826031611329162, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1259765625, + "kl": 0.023756607668474317, + "learning_rate": 7.252799999999999e-06, + "loss": 0.001, + "num_tokens": 86455902.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.0, + "completions/max_terminated_length": 551.0, + "completions/mean_length": 290.03125, + "completions/mean_terminated_length": 290.03125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.1983663943990665, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.02618178352713585, + "learning_rate": 7.252399999999999e-06, + "loss": 0.0031, + "num_tokens": 86495359.0, + "reward": 2.5693161487579346, + "reward_std": 0.526289165019989, + "rewards/reward_fn/mean": 2.5693161487579346, + "rewards/reward_fn/std": 0.526289165019989, + "step": 1870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1406.0, + "completions/max_terminated_length": 1406.0, + "completions/mean_length": 240.4375, + "completions/mean_terminated_length": 240.4375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.19847247268484142, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9296875, + "kl": 0.02834569150581956, + "learning_rate": 7.251999999999999e-06, + "loss": -0.0177, + "num_tokens": 86530893.0, + "reward": 3.391087055206299, + "reward_std": 0.46350401639938354, + "rewards/reward_fn/mean": 3.391087055206299, + "rewards/reward_fn/std": 0.46350395679473877, + "step": 1871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 791.0, + "completions/max_terminated_length": 791.0, + "completions/mean_length": 240.71875, + "completions/mean_terminated_length": 240.71875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.1985785509706163, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09228515625, + "kl": 0.024594660149887204, + "learning_rate": 7.251599999999999e-06, + "loss": 0.001, + "num_tokens": 86558980.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1009.0, + "completions/max_terminated_length": 1009.0, + "completions/mean_length": 278.96875, + "completions/mean_terminated_length": 278.96875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.19868462925639122, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08544921875, + "kl": 0.02399196708574891, + "learning_rate": 7.251199999999999e-06, + "loss": 0.001, + "num_tokens": 86602883.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 296.53125, + "completions/mean_terminated_length": 296.53125, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.19879070754216613, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.029965325025841594, + "learning_rate": 7.250799999999999e-06, + "loss": 0.0653, + "num_tokens": 86649364.0, + "reward": 3.2196264266967773, + "reward_std": 0.19695112109184265, + "rewards/reward_fn/mean": 3.2196264266967773, + "rewards/reward_fn/std": 0.19695109128952026, + "step": 1874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1055.0, + "completions/max_terminated_length": 1055.0, + "completions/mean_length": 279.03125, + "completions/mean_terminated_length": 279.03125, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.19889678582794101, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.015625, + "kl": 0.02199955377727747, + "learning_rate": 7.250399999999999e-06, + "loss": -0.065, + "num_tokens": 86696021.0, + "reward": 2.7779619693756104, + "reward_std": 0.1980944573879242, + "rewards/reward_fn/mean": 2.7779619693756104, + "rewards/reward_fn/std": 0.19809450209140778, + "step": 1875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 878.0, + "completions/mean_length": 601.0, + "completions/mean_terminated_length": 554.3225708007812, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "epoch": 0.19900286411371593, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.026174088707193732, + "learning_rate": 7.25e-06, + "loss": 0.1704, + "num_tokens": 86767861.0, + "reward": 2.664578914642334, + "reward_std": 0.6315154433250427, + "rewards/reward_fn/mean": 2.664578914642334, + "rewards/reward_fn/std": 0.6315154433250427, + "step": 1876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 190.21875, + "completions/mean_terminated_length": 190.21875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.1991089423994908, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10009765625, + "kl": 0.02615090156905353, + "learning_rate": 7.2496e-06, + "loss": 0.001, + "num_tokens": 86796092.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 94.09375, + "completions/mean_terminated_length": 94.09375, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.19921502068526573, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10791015625, + "kl": 0.017111393972299993, + "learning_rate": 7.2492e-06, + "loss": 0.0007, + "num_tokens": 86830815.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 305.34375, + "completions/mean_terminated_length": 305.34375, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.19932109897104064, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.024205820402130485, + "learning_rate": 7.2488e-06, + "loss": 0.002, + "num_tokens": 86879946.0, + "reward": 3.9707565307617188, + "reward_std": 0.165426567196846, + "rewards/reward_fn/mean": 3.9707565307617188, + "rewards/reward_fn/std": 0.16542655229568481, + "step": 1879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1539.0, + "completions/max_terminated_length": 1539.0, + "completions/mean_length": 450.59375, + "completions/mean_terminated_length": 450.59375, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.19942717725681552, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.021435604197904468, + "learning_rate": 7.2484e-06, + "loss": 0.0263, + "num_tokens": 86929725.0, + "reward": 3.7882189750671387, + "reward_std": 0.4483034312725067, + "rewards/reward_fn/mean": 3.7882189750671387, + "rewards/reward_fn/std": 0.4483034312725067, + "step": 1880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 262.28125, + "completions/mean_terminated_length": 262.28125, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.19953325554259044, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.023225291399285197, + "learning_rate": 7.248e-06, + "loss": 0.0906, + "num_tokens": 86988614.0, + "reward": 3.082364559173584, + "reward_std": 0.5728388428688049, + "rewards/reward_fn/mean": 3.082364559173584, + "rewards/reward_fn/std": 0.5728388428688049, + "step": 1881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1385.0, + "completions/max_terminated_length": 1385.0, + "completions/mean_length": 569.875, + "completions/mean_terminated_length": 569.875, + "completions/min_length": 367.0, + "completions/min_terminated_length": 367.0, + "epoch": 0.19963933382836532, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.021053237840533257, + "learning_rate": 7.2476e-06, + "loss": 0.1207, + "num_tokens": 87042786.0, + "reward": 2.95101261138916, + "reward_std": 0.061616383492946625, + "rewards/reward_fn/mean": 2.95101261138916, + "rewards/reward_fn/std": 0.061616357415914536, + "step": 1882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1802.0, + "completions/mean_length": 692.28125, + "completions/mean_terminated_length": 648.54833984375, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.19974541211414024, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.014040675945580006, + "learning_rate": 7.2472e-06, + "loss": 0.2063, + "num_tokens": 87110667.0, + "reward": 3.5004072189331055, + "reward_std": 1.0367085933685303, + "rewards/reward_fn/mean": 3.5004072189331055, + "rewards/reward_fn/std": 1.0367085933685303, + "step": 1883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 732.0, + "completions/max_terminated_length": 732.0, + "completions/mean_length": 233.75, + "completions/mean_terminated_length": 233.75, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.19985149039991515, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "kl": 0.03067116648890078, + "learning_rate": 7.2468e-06, + "loss": 0.0167, + "num_tokens": 87171843.0, + "reward": 3.351804733276367, + "reward_std": 0.448964923620224, + "rewards/reward_fn/mean": 3.351804733276367, + "rewards/reward_fn/std": 0.448964923620224, + "step": 1884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1385.0, + "completions/mean_length": 567.3125, + "completions/mean_terminated_length": 519.5484008789062, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.19995756868569003, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.0241457661613822, + "learning_rate": 7.2464e-06, + "loss": 0.0926, + "num_tokens": 87223469.0, + "reward": 2.813939094543457, + "reward_std": 0.8173084855079651, + "rewards/reward_fn/mean": 2.813939094543457, + "rewards/reward_fn/std": 0.8173085451126099, + "step": 1885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1567.0, + "completions/max_terminated_length": 1567.0, + "completions/mean_length": 442.90625, + "completions/mean_terminated_length": 442.90625, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.20006364697146495, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.024024329613894224, + "learning_rate": 7.246e-06, + "loss": 0.2072, + "num_tokens": 87270762.0, + "reward": 2.7583117485046387, + "reward_std": 0.26737165451049805, + "rewards/reward_fn/mean": 2.7583117485046387, + "rewards/reward_fn/std": 0.26737165451049805, + "step": 1886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1191.0, + "completions/max_terminated_length": 1191.0, + "completions/mean_length": 361.5625, + "completions/mean_terminated_length": 361.5625, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.20016972525723983, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.026753748068585992, + "learning_rate": 7.245599999999999e-06, + "loss": -0.1456, + "num_tokens": 87316988.0, + "reward": 2.71528959274292, + "reward_std": 0.49664467573165894, + "rewards/reward_fn/mean": 2.71528959274292, + "rewards/reward_fn/std": 0.49664464592933655, + "step": 1887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 196.3125, + "completions/mean_terminated_length": 196.3125, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.20027580354301475, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1162109375, + "kl": 0.02518186066299677, + "learning_rate": 7.2452e-06, + "loss": 0.001, + "num_tokens": 87361126.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 223.0, + "completions/mean_terminated_length": 223.0, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.20038188182878966, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.03339630598202348, + "learning_rate": 7.2448e-06, + "loss": 0.0036, + "num_tokens": 87409702.0, + "reward": 3.3883249759674072, + "reward_std": 0.5844486355781555, + "rewards/reward_fn/mean": 3.3883249759674072, + "rewards/reward_fn/std": 0.5844485759735107, + "step": 1889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1150.0, + "completions/max_terminated_length": 1150.0, + "completions/mean_length": 362.40625, + "completions/mean_terminated_length": 362.40625, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.20048796011456455, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.03504353458993137, + "learning_rate": 7.2444e-06, + "loss": 0.0972, + "num_tokens": 87458163.0, + "reward": 3.851999282836914, + "reward_std": 0.351296991109848, + "rewards/reward_fn/mean": 3.851999282836914, + "rewards/reward_fn/std": 0.35129696130752563, + "step": 1890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1037.0, + "completions/max_terminated_length": 1037.0, + "completions/mean_length": 273.3125, + "completions/mean_terminated_length": 273.3125, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.20059403840033946, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.028511138632893562, + "learning_rate": 7.244e-06, + "loss": 0.0735, + "num_tokens": 87498813.0, + "reward": 3.1870431900024414, + "reward_std": 0.5579171776771545, + "rewards/reward_fn/mean": 3.1870431900024414, + "rewards/reward_fn/std": 0.5579171776771545, + "step": 1891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1480.0, + "completions/max_terminated_length": 1480.0, + "completions/mean_length": 343.09375, + "completions/mean_terminated_length": 343.09375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.20070011668611434, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.024244441650807858, + "learning_rate": 7.2435999999999996e-06, + "loss": 0.0012, + "num_tokens": 87523360.0, + "reward": 3.2999448776245117, + "reward_std": 0.7095201015472412, + "rewards/reward_fn/mean": 3.2999448776245117, + "rewards/reward_fn/std": 0.7095201015472412, + "step": 1892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 760.0, + "completions/max_terminated_length": 760.0, + "completions/mean_length": 260.5, + "completions/mean_terminated_length": 260.5, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.20080619497188926, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.027942111948505044, + "learning_rate": 7.2431999999999995e-06, + "loss": 0.0428, + "num_tokens": 87573488.0, + "reward": 2.9267218112945557, + "reward_std": 0.04453163594007492, + "rewards/reward_fn/mean": 2.9267218112945557, + "rewards/reward_fn/std": 0.04453163221478462, + "step": 1893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 259.25, + "completions/mean_terminated_length": 259.25, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.20091227325766414, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.02555936831049621, + "learning_rate": 7.2427999999999995e-06, + "loss": 0.0033, + "num_tokens": 87616728.0, + "reward": 3.8891286849975586, + "reward_std": 0.2994978129863739, + "rewards/reward_fn/mean": 3.8891286849975586, + "rewards/reward_fn/std": 0.2994977831840515, + "step": 1894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 133.0, + "completions/max_terminated_length": 133.0, + "completions/mean_length": 87.6875, + "completions/mean_terminated_length": 87.6875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.20101835154343906, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1533203125, + "kl": 0.026342453667894006, + "learning_rate": 7.2423999999999995e-06, + "loss": 0.0011, + "num_tokens": 87654158.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 767.0, + "completions/mean_length": 492.625, + "completions/mean_terminated_length": 442.45159912109375, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.20112442982921397, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.03021831950172782, + "learning_rate": 7.2419999999999994e-06, + "loss": 0.2815, + "num_tokens": 87704802.0, + "reward": 2.7129340171813965, + "reward_std": 0.6130638718605042, + "rewards/reward_fn/mean": 2.7129340171813965, + "rewards/reward_fn/std": 0.6130638122558594, + "step": 1896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 184.53125, + "completions/mean_terminated_length": 184.53125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.20123050811498885, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.28125, + "kl": 0.03523435001261532, + "learning_rate": 7.241599999999999e-06, + "loss": -0.0579, + "num_tokens": 87740019.0, + "reward": 2.7257957458496094, + "reward_std": 0.22360388934612274, + "rewards/reward_fn/mean": 2.7257957458496094, + "rewards/reward_fn/std": 0.22360387444496155, + "step": 1897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 176.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 96.75, + "completions/mean_terminated_length": 96.75, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.20133658640076377, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.150390625, + "kl": 0.02294772327877581, + "learning_rate": 7.241199999999999e-06, + "loss": 0.0009, + "num_tokens": 87785803.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 681.0, + "completions/max_terminated_length": 681.0, + "completions/mean_length": 138.9375, + "completions/mean_terminated_length": 138.9375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.20144266468653865, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.515625, + "kl": 0.01800387806724757, + "learning_rate": 7.2408e-06, + "loss": 0.0354, + "num_tokens": 87826473.0, + "reward": 3.935486316680908, + "reward_std": 0.25387680530548096, + "rewards/reward_fn/mean": 3.935486316680908, + "rewards/reward_fn/std": 0.25387680530548096, + "step": 1899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1057.0, + "completions/max_terminated_length": 1057.0, + "completions/mean_length": 369.71875, + "completions/mean_terminated_length": 369.71875, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.20154874297231357, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.020978798624128103, + "learning_rate": 7.2404e-06, + "loss": -0.0053, + "num_tokens": 87872256.0, + "reward": 2.899190664291382, + "reward_std": 0.29119160771369934, + "rewards/reward_fn/mean": 2.899190664291382, + "rewards/reward_fn/std": 0.29119154810905457, + "step": 1900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 101.96875, + "completions/mean_terminated_length": 101.96875, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.20165482125808848, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11181640625, + "kl": 0.02071715716738254, + "learning_rate": 7.24e-06, + "loss": 0.0008, + "num_tokens": 87902847.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 616.0, + "completions/max_terminated_length": 616.0, + "completions/mean_length": 192.15625, + "completions/mean_terminated_length": 192.15625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.20176089954386336, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9765625, + "kl": 0.02114151930436492, + "learning_rate": 7.2396e-06, + "loss": 0.0038, + "num_tokens": 87947396.0, + "reward": 3.9619932174682617, + "reward_std": 0.2149982750415802, + "rewards/reward_fn/mean": 3.9619932174682617, + "rewards/reward_fn/std": 0.2149982899427414, + "step": 1902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 858.0, + "completions/max_terminated_length": 858.0, + "completions/mean_length": 135.125, + "completions/mean_terminated_length": 135.125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.20186697782963828, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.020640159607864916, + "learning_rate": 7.2392e-06, + "loss": 0.1303, + "num_tokens": 87990408.0, + "reward": 2.981785774230957, + "reward_std": 0.03771773725748062, + "rewards/reward_fn/mean": 2.981785774230957, + "rewards/reward_fn/std": 0.037717726081609726, + "step": 1903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1015.0, + "completions/max_terminated_length": 1015.0, + "completions/mean_length": 382.84375, + "completions/mean_terminated_length": 382.84375, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.20197305611541316, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.022791220573708415, + "learning_rate": 7.2388e-06, + "loss": 0.0422, + "num_tokens": 88035939.0, + "reward": 3.544314384460449, + "reward_std": 0.5268944501876831, + "rewards/reward_fn/mean": 3.544314384460449, + "rewards/reward_fn/std": 0.5268945097923279, + "step": 1904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1388.0, + "completions/max_terminated_length": 1388.0, + "completions/mean_length": 274.625, + "completions/mean_terminated_length": 274.625, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.20207913440118808, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07763671875, + "kl": 0.021082836901769042, + "learning_rate": 7.2384e-06, + "loss": 0.0008, + "num_tokens": 88080631.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1928.0, + "completions/max_terminated_length": 1928.0, + "completions/mean_length": 536.375, + "completions/mean_terminated_length": 536.375, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.202185212686963, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.023393918527290225, + "learning_rate": 7.238e-06, + "loss": -0.0174, + "num_tokens": 88133123.0, + "reward": 2.6614913940429688, + "reward_std": 0.3483014404773712, + "rewards/reward_fn/mean": 2.6614913940429688, + "rewards/reward_fn/std": 0.34830138087272644, + "step": 1906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1456.0, + "completions/mean_length": 763.9375, + "completions/mean_terminated_length": 722.51611328125, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "epoch": 0.20229129097273787, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.016707264934666455, + "learning_rate": 7.237599999999999e-06, + "loss": 0.0774, + "num_tokens": 88201729.0, + "reward": 2.7706775665283203, + "reward_std": 0.7481005787849426, + "rewards/reward_fn/mean": 2.7706775665283203, + "rewards/reward_fn/std": 0.7481005787849426, + "step": 1907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 193.75, + "completions/mean_terminated_length": 193.75, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.2023973692585128, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5625, + "kl": 0.026929725194349885, + "learning_rate": 7.237199999999999e-06, + "loss": 0.0431, + "num_tokens": 88235385.0, + "reward": 3.965723991394043, + "reward_std": 0.19389450550079346, + "rewards/reward_fn/mean": 3.965723991394043, + "rewards/reward_fn/std": 0.19389450550079346, + "step": 1908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 181.1875, + "completions/mean_terminated_length": 181.1875, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.20250344754428767, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4375, + "kl": 0.026674387976527214, + "learning_rate": 7.236799999999999e-06, + "loss": 0.02, + "num_tokens": 88278111.0, + "reward": 3.9655656814575195, + "reward_std": 0.19478978216648102, + "rewards/reward_fn/mean": 3.9655656814575195, + "rewards/reward_fn/std": 0.19478978216648102, + "step": 1909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 834.0, + "completions/max_terminated_length": 834.0, + "completions/mean_length": 374.9375, + "completions/mean_terminated_length": 374.9375, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.2026095258300626, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.01570185914169997, + "learning_rate": 7.236399999999999e-06, + "loss": 0.0236, + "num_tokens": 88323005.0, + "reward": 2.7527647018432617, + "reward_std": 0.04668011888861656, + "rewards/reward_fn/mean": 2.7527647018432617, + "rewards/reward_fn/std": 0.04668007418513298, + "step": 1910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 85.625, + "completions/mean_terminated_length": 85.625, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.2027156041158375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1025390625, + "kl": 0.01357071875827387, + "learning_rate": 7.236e-06, + "loss": 0.0005, + "num_tokens": 88352881.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1229.0, + "completions/max_terminated_length": 1229.0, + "completions/mean_length": 265.125, + "completions/mean_terminated_length": 265.125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.20282168240161239, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.328125, + "kl": 0.02683391165919602, + "learning_rate": 7.2356e-06, + "loss": -0.0249, + "num_tokens": 88376053.0, + "reward": 2.970895290374756, + "reward_std": 0.605861485004425, + "rewards/reward_fn/mean": 2.970895290374756, + "rewards/reward_fn/std": 0.605861485004425, + "step": 1912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 226.9375, + "completions/mean_terminated_length": 226.9375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.2029277606873873, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10009765625, + "kl": 0.018975378945469856, + "learning_rate": 7.2352e-06, + "loss": 0.0008, + "num_tokens": 88404915.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 235.1875, + "completions/mean_terminated_length": 235.1875, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.20303383897316218, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07763671875, + "kl": 0.021373262396082282, + "learning_rate": 7.2348e-06, + "loss": 0.0009, + "num_tokens": 88457657.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 913.0, + "completions/max_terminated_length": 913.0, + "completions/mean_length": 377.4375, + "completions/mean_terminated_length": 377.4375, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.2031399172589371, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.019720564829185605, + "learning_rate": 7.2344e-06, + "loss": -0.0058, + "num_tokens": 88506119.0, + "reward": 3.7221968173980713, + "reward_std": 0.7467592358589172, + "rewards/reward_fn/mean": 3.7221968173980713, + "rewards/reward_fn/std": 0.7467593550682068, + "step": 1915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1589.0, + "completions/max_terminated_length": 1589.0, + "completions/mean_length": 450.9375, + "completions/mean_terminated_length": 450.9375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.203245995544712, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1875, + "kl": 0.01790036354213953, + "learning_rate": 7.234e-06, + "loss": -0.0531, + "num_tokens": 88541797.0, + "reward": 2.7362747192382812, + "reward_std": 0.4555523097515106, + "rewards/reward_fn/mean": 2.7362747192382812, + "rewards/reward_fn/std": 0.4555523693561554, + "step": 1916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1013.0, + "completions/max_terminated_length": 1013.0, + "completions/mean_length": 329.84375, + "completions/mean_terminated_length": 329.84375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.2033520738304869, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.017196921980939806, + "learning_rate": 7.2336e-06, + "loss": -0.0422, + "num_tokens": 88603456.0, + "reward": 3.38254451751709, + "reward_std": 0.9729898571968079, + "rewards/reward_fn/mean": 3.38254451751709, + "rewards/reward_fn/std": 0.9729898571968079, + "step": 1917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 717.0, + "completions/max_terminated_length": 717.0, + "completions/mean_length": 220.875, + "completions/mean_terminated_length": 220.875, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.2034581521162618, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.023815185064449906, + "learning_rate": 7.2332e-06, + "loss": -0.014, + "num_tokens": 88651740.0, + "reward": 3.157181739807129, + "reward_std": 0.4949103593826294, + "rewards/reward_fn/mean": 3.157181739807129, + "rewards/reward_fn/std": 0.4949103593826294, + "step": 1918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 605.0, + "completions/max_terminated_length": 605.0, + "completions/mean_length": 202.375, + "completions/mean_terminated_length": 202.375, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.2035642304020367, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.027365016983821988, + "learning_rate": 7.2328e-06, + "loss": 0.0196, + "num_tokens": 88678088.0, + "reward": 3.5471441745758057, + "reward_std": 0.5224708914756775, + "rewards/reward_fn/mean": 3.5471441745758057, + "rewards/reward_fn/std": 0.5224708914756775, + "step": 1919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 885.0, + "completions/mean_length": 690.8125, + "completions/mean_terminated_length": 550.413818359375, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.2036703086878116, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.02398996870033443, + "learning_rate": 7.2323999999999996e-06, + "loss": 0.1143, + "num_tokens": 88753698.0, + "reward": 1.9489907026290894, + "reward_std": 0.5881763696670532, + "rewards/reward_fn/mean": 1.9489907026290894, + "rewards/reward_fn/std": 0.5881763696670532, + "step": 1920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 931.0, + "completions/max_terminated_length": 931.0, + "completions/mean_length": 315.375, + "completions/mean_terminated_length": 315.375, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.2037763869735865, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.024634240893647075, + "learning_rate": 7.2319999999999995e-06, + "loss": 0.1392, + "num_tokens": 88810734.0, + "reward": 3.686619281768799, + "reward_std": 0.5119209289550781, + "rewards/reward_fn/mean": 3.686619281768799, + "rewards/reward_fn/std": 0.5119208693504333, + "step": 1921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1063.0, + "completions/max_terminated_length": 1063.0, + "completions/mean_length": 240.15625, + "completions/mean_terminated_length": 240.15625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.2038824652593614, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.02241891936864704, + "learning_rate": 7.2315999999999995e-06, + "loss": 0.0191, + "num_tokens": 88852499.0, + "reward": 3.011032819747925, + "reward_std": 0.3259943425655365, + "rewards/reward_fn/mean": 3.011032819747925, + "rewards/reward_fn/std": 0.3259943425655365, + "step": 1922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 310.5, + "completions/mean_terminated_length": 310.5, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.20398854354513632, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.022662291070446372, + "learning_rate": 7.2312e-06, + "loss": 0.0401, + "num_tokens": 88901475.0, + "reward": 3.343940258026123, + "reward_std": 0.7196161150932312, + "rewards/reward_fn/mean": 3.343940258026123, + "rewards/reward_fn/std": 0.7196161150932312, + "step": 1923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 704.0, + "completions/max_terminated_length": 704.0, + "completions/mean_length": 229.6875, + "completions/mean_terminated_length": 229.6875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.2040946218309112, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.027691754046827555, + "learning_rate": 7.2308e-06, + "loss": 0.0728, + "num_tokens": 88939737.0, + "reward": 3.8904521465301514, + "reward_std": 0.34605592489242554, + "rewards/reward_fn/mean": 3.8904521465301514, + "rewards/reward_fn/std": 0.34605586528778076, + "step": 1924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 129.96875, + "completions/mean_terminated_length": 129.96875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.20420070011668612, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11865234375, + "kl": 0.022279798751696944, + "learning_rate": 7.2304e-06, + "loss": 0.0009, + "num_tokens": 88987640.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.0, + "completions/max_terminated_length": 517.0, + "completions/mean_length": 299.75, + "completions/mean_terminated_length": 299.75, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.204306778402461, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.026795195881277323, + "learning_rate": 7.23e-06, + "loss": 0.0406, + "num_tokens": 89030320.0, + "reward": 2.822610378265381, + "reward_std": 0.061767082661390305, + "rewards/reward_fn/mean": 2.822610378265381, + "rewards/reward_fn/std": 0.06176706776022911, + "step": 1926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 898.0, + "completions/max_terminated_length": 898.0, + "completions/mean_length": 260.1875, + "completions/mean_terminated_length": 260.1875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.20441285668823592, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.02193640125915408, + "learning_rate": 7.229599999999999e-06, + "loss": 0.0855, + "num_tokens": 89073910.0, + "reward": 3.5822935104370117, + "reward_std": 0.5866038799285889, + "rewards/reward_fn/mean": 3.5822935104370117, + "rewards/reward_fn/std": 0.5866038799285889, + "step": 1927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1132.0, + "completions/max_terminated_length": 1132.0, + "completions/mean_length": 391.15625, + "completions/mean_terminated_length": 391.15625, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.20451893497401083, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.02066147024743259, + "learning_rate": 7.229199999999999e-06, + "loss": 0.0274, + "num_tokens": 89120123.0, + "reward": 3.036647319793701, + "reward_std": 0.44018828868865967, + "rewards/reward_fn/mean": 3.036647319793701, + "rewards/reward_fn/std": 0.4401882588863373, + "step": 1928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 718.0, + "completions/max_terminated_length": 718.0, + "completions/mean_length": 418.71875, + "completions/mean_terminated_length": 418.71875, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.20462501325978572, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.018091494566760957, + "learning_rate": 7.228799999999999e-06, + "loss": 0.0708, + "num_tokens": 89180338.0, + "reward": 3.9663846492767334, + "reward_std": 0.19015701115131378, + "rewards/reward_fn/mean": 3.9663846492767334, + "rewards/reward_fn/std": 0.19015701115131378, + "step": 1929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 166.59375, + "completions/mean_terminated_length": 166.59375, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.20473109154556063, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09033203125, + "kl": 0.020806429674848914, + "learning_rate": 7.228399999999999e-06, + "loss": 0.0008, + "num_tokens": 89217669.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1150.0, + "completions/max_terminated_length": 1150.0, + "completions/mean_length": 289.3125, + "completions/mean_terminated_length": 289.3125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.2048371698313355, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.02973605995066464, + "learning_rate": 7.227999999999999e-06, + "loss": 0.0377, + "num_tokens": 89254255.0, + "reward": 2.971668243408203, + "reward_std": 0.39661845564842224, + "rewards/reward_fn/mean": 2.971668243408203, + "rewards/reward_fn/std": 0.39661842584609985, + "step": 1931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 134.0, + "completions/max_terminated_length": 134.0, + "completions/mean_length": 73.0625, + "completions/mean_terminated_length": 73.0625, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.20494324811711043, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.123046875, + "kl": 0.01642139005707577, + "learning_rate": 7.227599999999999e-06, + "loss": 0.0007, + "num_tokens": 89277937.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 182.625, + "completions/mean_terminated_length": 182.625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.20504932640288534, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10791015625, + "kl": 0.022830116329714656, + "learning_rate": 7.227199999999999e-06, + "loss": 0.0009, + "num_tokens": 89323109.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1055.0, + "completions/max_terminated_length": 1055.0, + "completions/mean_length": 358.59375, + "completions/mean_terminated_length": 358.59375, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.20515540468866023, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.025236302288249135, + "learning_rate": 7.226799999999999e-06, + "loss": 0.0009, + "num_tokens": 89368024.0, + "reward": 3.8563036918640137, + "reward_std": 0.48152342438697815, + "rewards/reward_fn/mean": 3.8563036918640137, + "rewards/reward_fn/std": 0.48152339458465576, + "step": 1934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 597.0, + "completions/max_terminated_length": 597.0, + "completions/mean_length": 113.4375, + "completions/mean_terminated_length": 113.4375, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.20526148297443514, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.90625, + "kl": 0.025299014407210052, + "learning_rate": 7.2264e-06, + "loss": -0.0658, + "num_tokens": 89409446.0, + "reward": 3.0979630947113037, + "reward_std": 1.1084299087524414, + "rewards/reward_fn/mean": 3.0979630947113037, + "rewards/reward_fn/std": 1.1084297895431519, + "step": 1935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 647.0, + "completions/max_terminated_length": 647.0, + "completions/mean_length": 205.46875, + "completions/mean_terminated_length": 205.46875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.20536756126021002, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.022546561784110963, + "learning_rate": 7.226e-06, + "loss": 0.0005, + "num_tokens": 89460885.0, + "reward": 3.048456907272339, + "reward_std": 0.3682403266429901, + "rewards/reward_fn/mean": 3.048456907272339, + "rewards/reward_fn/std": 0.3682402968406677, + "step": 1936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1415.0, + "completions/max_terminated_length": 1415.0, + "completions/mean_length": 530.9375, + "completions/mean_terminated_length": 530.9375, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.20547363954598494, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.021520751295611262, + "learning_rate": 7.2256e-06, + "loss": -0.0159, + "num_tokens": 89514739.0, + "reward": 3.869535207748413, + "reward_std": 0.31333673000335693, + "rewards/reward_fn/mean": 3.869535207748413, + "rewards/reward_fn/std": 0.31333670020103455, + "step": 1937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 213.75, + "completions/mean_terminated_length": 213.75, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.20557971783175985, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0859375, + "kl": 0.01827068265993148, + "learning_rate": 7.2252e-06, + "loss": 0.0007, + "num_tokens": 89556011.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 124.0, + "completions/max_terminated_length": 124.0, + "completions/mean_length": 70.53125, + "completions/mean_terminated_length": 70.53125, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.20568579611753474, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.8125, + "kl": 0.02340351662132889, + "learning_rate": 7.2248e-06, + "loss": -0.096, + "num_tokens": 89607260.0, + "reward": 3.724715232849121, + "reward_std": 0.4149995446205139, + "rewards/reward_fn/mean": 3.724715232849121, + "rewards/reward_fn/std": 0.41499951481819153, + "step": 1939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 94.125, + "completions/mean_terminated_length": 94.125, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.20579187440330965, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1220703125, + "kl": 0.017246187082491815, + "learning_rate": 7.2244e-06, + "loss": 0.0007, + "num_tokens": 89646368.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 972.0, + "completions/max_terminated_length": 972.0, + "completions/mean_length": 359.59375, + "completions/mean_terminated_length": 359.59375, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.20589795268908453, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.022477731574326754, + "learning_rate": 7.224e-06, + "loss": 0.0377, + "num_tokens": 89695955.0, + "reward": 3.152523994445801, + "reward_std": 0.6828972697257996, + "rewards/reward_fn/mean": 3.152523994445801, + "rewards/reward_fn/std": 0.6828973293304443, + "step": 1941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 915.0, + "completions/max_terminated_length": 915.0, + "completions/mean_length": 238.0, + "completions/mean_terminated_length": 238.0, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.20600403097485945, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.328125, + "kl": 0.025746502447873354, + "learning_rate": 7.2236e-06, + "loss": -0.1531, + "num_tokens": 89738867.0, + "reward": 3.314828872680664, + "reward_std": 0.5770388841629028, + "rewards/reward_fn/mean": 3.314828872680664, + "rewards/reward_fn/std": 0.5770388841629028, + "step": 1942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 764.0, + "completions/max_terminated_length": 764.0, + "completions/mean_length": 340.0625, + "completions/mean_terminated_length": 340.0625, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.20611010926063436, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.017551672644913197, + "learning_rate": 7.2232e-06, + "loss": 0.1222, + "num_tokens": 89787157.0, + "reward": 3.962864875793457, + "reward_std": 0.21006862819194794, + "rewards/reward_fn/mean": 3.962864875793457, + "rewards/reward_fn/std": 0.21006861329078674, + "step": 1943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1634.0, + "completions/mean_length": 458.9375, + "completions/mean_terminated_length": 407.6773986816406, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.20621618754640925, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.018971068551763892, + "learning_rate": 7.2228e-06, + "loss": 0.2706, + "num_tokens": 89837971.0, + "reward": 3.80379319190979, + "reward_std": 0.8024195432662964, + "rewards/reward_fn/mean": 3.80379319190979, + "rewards/reward_fn/std": 0.8024195432662964, + "step": 1944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 769.0, + "completions/max_terminated_length": 769.0, + "completions/mean_length": 272.4375, + "completions/mean_terminated_length": 272.4375, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.20632226583218416, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.026358928764238954, + "learning_rate": 7.2224e-06, + "loss": -0.0807, + "num_tokens": 89877921.0, + "reward": 3.8525662422180176, + "reward_std": 0.39788663387298584, + "rewards/reward_fn/mean": 3.8525662422180176, + "rewards/reward_fn/std": 0.39788660407066345, + "step": 1945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1134.0, + "completions/max_terminated_length": 1134.0, + "completions/mean_length": 303.8125, + "completions/mean_terminated_length": 303.8125, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.20642834411795905, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.02223000884987414, + "learning_rate": 7.2220000000000005e-06, + "loss": -0.1545, + "num_tokens": 89947163.0, + "reward": 3.1230030059814453, + "reward_std": 0.41615304350852966, + "rewards/reward_fn/mean": 3.1230030059814453, + "rewards/reward_fn/std": 0.4161530137062073, + "step": 1946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 889.0, + "completions/max_terminated_length": 889.0, + "completions/mean_length": 252.90625, + "completions/mean_terminated_length": 252.90625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.20653442240373396, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.515625, + "kl": 0.026710605947300792, + "learning_rate": 7.2216e-06, + "loss": 0.0586, + "num_tokens": 89984600.0, + "reward": 3.95160174369812, + "reward_std": 0.1915348619222641, + "rewards/reward_fn/mean": 3.95160174369812, + "rewards/reward_fn/std": 0.1915348470211029, + "step": 1947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 299.71875, + "completions/mean_terminated_length": 299.71875, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.20664050068950887, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.02253810246475041, + "learning_rate": 7.2211999999999996e-06, + "loss": 0.0526, + "num_tokens": 90031375.0, + "reward": 3.298675537109375, + "reward_std": 0.590697169303894, + "rewards/reward_fn/mean": 3.298675537109375, + "rewards/reward_fn/std": 0.590697169303894, + "step": 1948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 946.0, + "completions/max_terminated_length": 946.0, + "completions/mean_length": 257.4375, + "completions/mean_terminated_length": 257.4375, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.20674657897528376, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.02902632998302579, + "learning_rate": 7.2207999999999995e-06, + "loss": 0.0865, + "num_tokens": 90083869.0, + "reward": 3.1455907821655273, + "reward_std": 0.08136258274316788, + "rewards/reward_fn/mean": 3.1455907821655273, + "rewards/reward_fn/std": 0.08136259019374847, + "step": 1949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 755.0, + "completions/max_terminated_length": 755.0, + "completions/mean_length": 172.09375, + "completions/mean_terminated_length": 172.09375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.20685265726105867, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07470703125, + "kl": 0.018278813688084483, + "learning_rate": 7.2203999999999995e-06, + "loss": 0.0007, + "num_tokens": 90104640.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1162.0, + "completions/max_terminated_length": 1162.0, + "completions/mean_length": 359.125, + "completions/mean_terminated_length": 359.125, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.20695873554683356, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.02860428229905665, + "learning_rate": 7.2199999999999995e-06, + "loss": 0.048, + "num_tokens": 90178148.0, + "reward": 3.1251840591430664, + "reward_std": 0.5875481367111206, + "rewards/reward_fn/mean": 3.1251840591430664, + "rewards/reward_fn/std": 0.5875481367111206, + "step": 1951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 159.5625, + "completions/mean_terminated_length": 159.5625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.20706481383260847, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "kl": 0.025235574459657073, + "learning_rate": 7.2195999999999995e-06, + "loss": -0.0463, + "num_tokens": 90223062.0, + "reward": 3.6346235275268555, + "reward_std": 0.5507158041000366, + "rewards/reward_fn/mean": 3.6346235275268555, + "rewards/reward_fn/std": 0.5507158041000366, + "step": 1952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 759.0, + "completions/max_terminated_length": 759.0, + "completions/mean_length": 256.8125, + "completions/mean_terminated_length": 256.8125, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.20717089211838335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07958984375, + "kl": 0.020091851707547903, + "learning_rate": 7.219199999999999e-06, + "loss": 0.0008, + "num_tokens": 90267120.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 177.78125, + "completions/mean_terminated_length": 177.78125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.20727697040415827, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16015625, + "kl": 0.02827597805298865, + "learning_rate": 7.218799999999999e-06, + "loss": 0.0011, + "num_tokens": 90325865.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1240.0, + "completions/max_terminated_length": 1240.0, + "completions/mean_length": 378.53125, + "completions/mean_terminated_length": 378.53125, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.20738304868993318, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.02135020843707025, + "learning_rate": 7.218399999999999e-06, + "loss": 0.1212, + "num_tokens": 90372186.0, + "reward": 3.716845750808716, + "reward_std": 0.5746200084686279, + "rewards/reward_fn/mean": 3.716845750808716, + "rewards/reward_fn/std": 0.5746200084686279, + "step": 1955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 989.0, + "completions/max_terminated_length": 989.0, + "completions/mean_length": 290.375, + "completions/mean_terminated_length": 290.375, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.20748912697570807, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.022205424727872014, + "learning_rate": 7.217999999999999e-06, + "loss": 0.0699, + "num_tokens": 90414950.0, + "reward": 3.3851747512817383, + "reward_std": 0.5178138017654419, + "rewards/reward_fn/mean": 3.3851747512817383, + "rewards/reward_fn/std": 0.5178138613700867, + "step": 1956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 616.0, + "completions/max_terminated_length": 616.0, + "completions/mean_length": 263.96875, + "completions/mean_terminated_length": 263.96875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.20759520526148298, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.024157197680324316, + "learning_rate": 7.217599999999999e-06, + "loss": 0.0051, + "num_tokens": 90440677.0, + "reward": 3.5730175971984863, + "reward_std": 0.606890082359314, + "rewards/reward_fn/mean": 3.5730175971984863, + "rewards/reward_fn/std": 0.6068900227546692, + "step": 1957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 158.15625, + "completions/mean_terminated_length": 158.15625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.20770128354725786, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1318359375, + "kl": 0.028684925520792603, + "learning_rate": 7.2172e-06, + "loss": 0.0011, + "num_tokens": 90494474.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1158.0, + "completions/max_terminated_length": 1158.0, + "completions/mean_length": 412.21875, + "completions/mean_terminated_length": 412.21875, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.20780736183303278, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.01932144328020513, + "learning_rate": 7.2168e-06, + "loss": 0.0304, + "num_tokens": 90545521.0, + "reward": 3.859956741333008, + "reward_std": 0.5511422157287598, + "rewards/reward_fn/mean": 3.859956741333008, + "rewards/reward_fn/std": 0.5511422157287598, + "step": 1959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 958.0, + "completions/max_terminated_length": 958.0, + "completions/mean_length": 437.4375, + "completions/mean_terminated_length": 437.4375, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.2079134401188077, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.01740968832746148, + "learning_rate": 7.2164e-06, + "loss": -0.0076, + "num_tokens": 90596639.0, + "reward": 2.462031841278076, + "reward_std": 0.5236942172050476, + "rewards/reward_fn/mean": 2.462031841278076, + "rewards/reward_fn/std": 0.5236942172050476, + "step": 1960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 766.0, + "completions/max_terminated_length": 766.0, + "completions/mean_length": 245.875, + "completions/mean_terminated_length": 245.875, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.20801951840458258, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.328125, + "kl": 0.021013896446675062, + "learning_rate": 7.216e-06, + "loss": 0.2187, + "num_tokens": 90631643.0, + "reward": 3.968956470489502, + "reward_std": 0.17560802400112152, + "rewards/reward_fn/mean": 3.968956470489502, + "rewards/reward_fn/std": 0.17560799419879913, + "step": 1961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 127.25, + "completions/mean_terminated_length": 127.25, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.2081255966903575, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1240234375, + "kl": 0.02872321312315762, + "learning_rate": 7.2156e-06, + "loss": 0.0011, + "num_tokens": 90663715.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 162.03125, + "completions/mean_terminated_length": 162.03125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.20823167497613237, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.890625, + "kl": 0.022123705130070448, + "learning_rate": 7.2152e-06, + "loss": 0.0121, + "num_tokens": 90700004.0, + "reward": 3.959567070007324, + "reward_std": 0.22872252762317657, + "rewards/reward_fn/mean": 3.959567070007324, + "rewards/reward_fn/std": 0.22872251272201538, + "step": 1963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1812.0, + "completions/max_terminated_length": 1812.0, + "completions/mean_length": 307.46875, + "completions/mean_terminated_length": 307.46875, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.2083377532619073, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.029133395524695516, + "learning_rate": 7.2148e-06, + "loss": 0.1083, + "num_tokens": 90749555.0, + "reward": 3.5827994346618652, + "reward_std": 0.5858627557754517, + "rewards/reward_fn/mean": 3.5827994346618652, + "rewards/reward_fn/std": 0.5858627557754517, + "step": 1964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1016.0, + "completions/max_terminated_length": 1016.0, + "completions/mean_length": 389.84375, + "completions/mean_terminated_length": 389.84375, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.2084438315476822, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.018395462189801037, + "learning_rate": 7.2144e-06, + "loss": 0.0121, + "num_tokens": 90804846.0, + "reward": 2.98877215385437, + "reward_std": 0.03333742171525955, + "rewards/reward_fn/mean": 2.98877215385437, + "rewards/reward_fn/std": 0.033337417989969254, + "step": 1965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 160.375, + "completions/mean_terminated_length": 160.375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.2085499098334571, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15234375, + "kl": 0.01872719032689929, + "learning_rate": 7.214e-06, + "loss": 0.0007, + "num_tokens": 90850298.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 73.65625, + "completions/mean_terminated_length": 73.65625, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.208655988119232, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11767578125, + "kl": 0.015950615401379764, + "learning_rate": 7.213599999999999e-06, + "loss": 0.0006, + "num_tokens": 90886063.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 653.0, + "completions/max_terminated_length": 653.0, + "completions/mean_length": 285.78125, + "completions/mean_terminated_length": 285.78125, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.20876206640500689, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.020902880001813173, + "learning_rate": 7.213199999999999e-06, + "loss": 0.0817, + "num_tokens": 90931080.0, + "reward": 3.021613597869873, + "reward_std": 0.18543782830238342, + "rewards/reward_fn/mean": 3.021613597869873, + "rewards/reward_fn/std": 0.18543781340122223, + "step": 1968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1107.0, + "completions/max_terminated_length": 1107.0, + "completions/mean_length": 282.34375, + "completions/mean_terminated_length": 282.34375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.2088681446907818, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.01968630903866142, + "learning_rate": 7.212799999999999e-06, + "loss": 0.0353, + "num_tokens": 90989203.0, + "reward": 2.8288135528564453, + "reward_std": 0.04306629300117493, + "rewards/reward_fn/mean": 2.8288135528564453, + "rewards/reward_fn/std": 0.04306626692414284, + "step": 1969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1025.0, + "completions/max_terminated_length": 1025.0, + "completions/mean_length": 227.25, + "completions/mean_terminated_length": 227.25, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.2089742229765567, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.018828594125807285, + "learning_rate": 7.2124e-06, + "loss": 0.0222, + "num_tokens": 91047163.0, + "reward": 3.745974063873291, + "reward_std": 0.4881126582622528, + "rewards/reward_fn/mean": 3.745974063873291, + "rewards/reward_fn/std": 0.4881126284599304, + "step": 1970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1225.0, + "completions/max_terminated_length": 1225.0, + "completions/mean_length": 414.4375, + "completions/mean_terminated_length": 414.4375, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.2090803012623316, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.021128013264387846, + "learning_rate": 7.212e-06, + "loss": 0.0099, + "num_tokens": 91107625.0, + "reward": 3.6306121349334717, + "reward_std": 0.7473952174186707, + "rewards/reward_fn/mean": 3.6306121349334717, + "rewards/reward_fn/std": 0.7473952770233154, + "step": 1971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1020.0, + "completions/max_terminated_length": 1020.0, + "completions/mean_length": 283.71875, + "completions/mean_terminated_length": 283.71875, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.2091863795481065, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0654296875, + "kl": 0.014698807732202113, + "learning_rate": 7.2116e-06, + "loss": 0.0006, + "num_tokens": 91172608.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 832.0, + "completions/max_terminated_length": 832.0, + "completions/mean_length": 321.625, + "completions/mean_terminated_length": 321.625, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.2092924578338814, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.021146057173609734, + "learning_rate": 7.2112e-06, + "loss": -0.0184, + "num_tokens": 91230484.0, + "reward": 3.395284414291382, + "reward_std": 0.5098738074302673, + "rewards/reward_fn/mean": 3.395284414291382, + "rewards/reward_fn/std": 0.5098738074302673, + "step": 1973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.0, + "completions/max_terminated_length": 532.0, + "completions/mean_length": 100.03125, + "completions/mean_terminated_length": 100.03125, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.2093985361196563, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1376953125, + "kl": 0.0216621074359864, + "learning_rate": 7.2108e-06, + "loss": 0.0009, + "num_tokens": 91264469.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1422.0, + "completions/mean_length": 542.25, + "completions/mean_terminated_length": 493.6773986816406, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.20950461440543122, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.01485793653409928, + "learning_rate": 7.2104e-06, + "loss": 0.1539, + "num_tokens": 91328349.0, + "reward": 3.7629165649414062, + "reward_std": 0.7722904682159424, + "rewards/reward_fn/mean": 3.7629165649414062, + "rewards/reward_fn/std": 0.7722904682159424, + "step": 1975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 108.21875, + "completions/mean_terminated_length": 108.21875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.2096106926912061, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.828125, + "kl": 0.018597336602397263, + "learning_rate": 7.21e-06, + "loss": -0.0098, + "num_tokens": 91363300.0, + "reward": 3.804640293121338, + "reward_std": 0.34462565183639526, + "rewards/reward_fn/mean": 3.804640293121338, + "rewards/reward_fn/std": 0.3446256220340729, + "step": 1976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 859.0, + "completions/max_terminated_length": 859.0, + "completions/mean_length": 279.78125, + "completions/mean_terminated_length": 279.78125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.20971677097698102, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.01667685038410127, + "learning_rate": 7.2095999999999995e-06, + "loss": 0.1173, + "num_tokens": 91388605.0, + "reward": 3.961299180984497, + "reward_std": 0.2189248949289322, + "rewards/reward_fn/mean": 3.961299180984497, + "rewards/reward_fn/std": 0.2189248949289322, + "step": 1977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 900.0, + "completions/max_terminated_length": 900.0, + "completions/mean_length": 229.0, + "completions/mean_terminated_length": 229.0, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.2098228492627559, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08203125, + "kl": 0.020580741576850414, + "learning_rate": 7.2091999999999995e-06, + "loss": 0.0008, + "num_tokens": 91426109.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 952.0, + "completions/max_terminated_length": 952.0, + "completions/mean_length": 310.21875, + "completions/mean_terminated_length": 310.21875, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.20992892754853082, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.02507057785987854, + "learning_rate": 7.2087999999999995e-06, + "loss": 0.1101, + "num_tokens": 91482500.0, + "reward": 3.840827465057373, + "reward_std": 0.37968137860298157, + "rewards/reward_fn/mean": 3.840827465057373, + "rewards/reward_fn/std": 0.37968140840530396, + "step": 1979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1106.0, + "completions/max_terminated_length": 1106.0, + "completions/mean_length": 195.875, + "completions/mean_terminated_length": 195.875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.2100350058343057, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4375, + "kl": 0.02671630633994937, + "learning_rate": 7.2083999999999995e-06, + "loss": 0.0589, + "num_tokens": 91522784.0, + "reward": 2.8789358139038086, + "reward_std": 0.2997475862503052, + "rewards/reward_fn/mean": 2.8789358139038086, + "rewards/reward_fn/std": 0.2997475862503052, + "step": 1980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 708.0, + "completions/max_terminated_length": 708.0, + "completions/mean_length": 149.78125, + "completions/mean_terminated_length": 149.78125, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.21014108412008062, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.65625, + "kl": 0.025148641783744097, + "learning_rate": 7.208e-06, + "loss": -0.0394, + "num_tokens": 91563673.0, + "reward": 3.7303476333618164, + "reward_std": 0.26785701513290405, + "rewards/reward_fn/mean": 3.7303476333618164, + "rewards/reward_fn/std": 0.26785698533058167, + "step": 1981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 700.0, + "completions/max_terminated_length": 700.0, + "completions/mean_length": 207.4375, + "completions/mean_terminated_length": 207.4375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.21024716240585553, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.025888599455356598, + "learning_rate": 7.2076e-06, + "loss": 0.1438, + "num_tokens": 91597991.0, + "reward": 3.4232587814331055, + "reward_std": 0.546558678150177, + "rewards/reward_fn/mean": 3.4232587814331055, + "rewards/reward_fn/std": 0.546558678150177, + "step": 1982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1835.0, + "completions/max_terminated_length": 1835.0, + "completions/mean_length": 374.125, + "completions/mean_terminated_length": 374.125, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.21035324069163042, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.02524533332325518, + "learning_rate": 7.2072e-06, + "loss": 0.051, + "num_tokens": 91647627.0, + "reward": 2.730729103088379, + "reward_std": 0.4013659358024597, + "rewards/reward_fn/mean": 2.730729103088379, + "rewards/reward_fn/std": 0.4013659656047821, + "step": 1983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1921.0, + "completions/mean_length": 715.15625, + "completions/mean_terminated_length": 577.27587890625, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.21045931897740533, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.02549780602566898, + "learning_rate": 7.2068e-06, + "loss": 0.1905, + "num_tokens": 91701296.0, + "reward": 2.3230574131011963, + "reward_std": 0.7773017883300781, + "rewards/reward_fn/mean": 2.3230574131011963, + "rewards/reward_fn/std": 0.7773017883300781, + "step": 1984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1430.0, + "completions/max_terminated_length": 1430.0, + "completions/mean_length": 395.34375, + "completions/mean_terminated_length": 395.34375, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.21056539726318022, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.021748070372268558, + "learning_rate": 7.2064e-06, + "loss": 0.0392, + "num_tokens": 91755099.0, + "reward": 3.6780052185058594, + "reward_std": 0.551239013671875, + "rewards/reward_fn/mean": 3.6780052185058594, + "rewards/reward_fn/std": 0.551239013671875, + "step": 1985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1114.0, + "completions/max_terminated_length": 1114.0, + "completions/mean_length": 344.25, + "completions/mean_terminated_length": 344.25, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.21067147554895513, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.021804936230182648, + "learning_rate": 7.206e-06, + "loss": 0.0594, + "num_tokens": 91805603.0, + "reward": 2.9031484127044678, + "reward_std": 0.046194564551115036, + "rewards/reward_fn/mean": 2.9031484127044678, + "rewards/reward_fn/std": 0.04619458317756653, + "step": 1986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 183.90625, + "completions/mean_terminated_length": 183.90625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.21077755383473004, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1259765625, + "kl": 0.028120714705437422, + "learning_rate": 7.205599999999999e-06, + "loss": 0.0011, + "num_tokens": 91849088.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 945.0, + "completions/max_terminated_length": 945.0, + "completions/mean_length": 273.84375, + "completions/mean_terminated_length": 273.84375, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.21088363212050493, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.02706411969847977, + "learning_rate": 7.205199999999999e-06, + "loss": 0.0943, + "num_tokens": 91893915.0, + "reward": 3.962301254272461, + "reward_std": 0.21325629949569702, + "rewards/reward_fn/mean": 3.962301254272461, + "rewards/reward_fn/std": 0.21325626969337463, + "step": 1988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 201.0, + "completions/mean_terminated_length": 201.0, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.21098971040627984, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.020172378746792674, + "learning_rate": 7.204799999999999e-06, + "loss": 0.0691, + "num_tokens": 91930811.0, + "reward": 2.7958617210388184, + "reward_std": 0.03389512747526169, + "rewards/reward_fn/mean": 2.7958617210388184, + "rewards/reward_fn/std": 0.033895138651132584, + "step": 1989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 153.4375, + "completions/mean_terminated_length": 153.4375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.21109578869205473, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.875, + "kl": 0.03439820581115782, + "learning_rate": 7.204399999999999e-06, + "loss": 0.1005, + "num_tokens": 91970377.0, + "reward": 3.9778530597686768, + "reward_std": 0.12528198957443237, + "rewards/reward_fn/mean": 3.9778530597686768, + "rewards/reward_fn/std": 0.12528197467327118, + "step": 1990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1437.0, + "completions/max_terminated_length": 1437.0, + "completions/mean_length": 259.09375, + "completions/mean_terminated_length": 259.09375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.21120186697782964, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.022125726332888007, + "learning_rate": 7.203999999999999e-06, + "loss": -0.1815, + "num_tokens": 92008268.0, + "reward": 3.5986576080322266, + "reward_std": 0.49440956115722656, + "rewards/reward_fn/mean": 3.5986576080322266, + "rewards/reward_fn/std": 0.4944095313549042, + "step": 1991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 590.0, + "completions/max_terminated_length": 590.0, + "completions/mean_length": 241.875, + "completions/mean_terminated_length": 241.875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.21130794526360455, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.022150580305606127, + "learning_rate": 7.203599999999999e-06, + "loss": 0.3028, + "num_tokens": 92046408.0, + "reward": 2.9095382690429688, + "reward_std": 0.6566404104232788, + "rewards/reward_fn/mean": 2.9095382690429688, + "rewards/reward_fn/std": 0.6566404104232788, + "step": 1992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 261.25, + "completions/mean_terminated_length": 261.25, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.21141402354937944, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.02650432544760406, + "learning_rate": 7.2032e-06, + "loss": -0.0708, + "num_tokens": 92098832.0, + "reward": 3.028238296508789, + "reward_std": 0.47540026903152466, + "rewards/reward_fn/mean": 3.028238296508789, + "rewards/reward_fn/std": 0.4754002094268799, + "step": 1993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 684.0, + "completions/max_terminated_length": 684.0, + "completions/mean_length": 246.09375, + "completions/mean_terminated_length": 246.09375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.21152010183515435, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.025201337644830346, + "learning_rate": 7.2028e-06, + "loss": 0.1415, + "num_tokens": 92144115.0, + "reward": 3.697462558746338, + "reward_std": 0.6422297954559326, + "rewards/reward_fn/mean": 3.697462558746338, + "rewards/reward_fn/std": 0.6422297954559326, + "step": 1994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 711.0, + "completions/max_terminated_length": 711.0, + "completions/mean_length": 216.5, + "completions/mean_terminated_length": 216.5, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.21162618012092924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09228515625, + "kl": 0.022127235773950815, + "learning_rate": 7.2024e-06, + "loss": 0.0009, + "num_tokens": 92197027.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1788.0, + "completions/max_terminated_length": 1788.0, + "completions/mean_length": 335.375, + "completions/mean_terminated_length": 335.375, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.21173225840670415, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.027125931112095714, + "learning_rate": 7.202e-06, + "loss": 0.1132, + "num_tokens": 92261455.0, + "reward": 3.7107300758361816, + "reward_std": 0.6581941246986389, + "rewards/reward_fn/mean": 3.7107300758361816, + "rewards/reward_fn/std": 0.6581941246986389, + "step": 1996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 956.0, + "completions/max_terminated_length": 956.0, + "completions/mean_length": 324.40625, + "completions/mean_terminated_length": 324.40625, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.21183833669247906, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.018547830171883106, + "learning_rate": 7.2016e-06, + "loss": -0.0218, + "num_tokens": 92303356.0, + "reward": 3.929755687713623, + "reward_std": 0.2764107584953308, + "rewards/reward_fn/mean": 3.929755687713623, + "rewards/reward_fn/std": 0.2764107286930084, + "step": 1997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 761.0, + "completions/max_terminated_length": 761.0, + "completions/mean_length": 219.5625, + "completions/mean_terminated_length": 219.5625, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.21194441497825395, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.016332231694832444, + "learning_rate": 7.2012e-06, + "loss": 0.0117, + "num_tokens": 92341646.0, + "reward": 2.746156692504883, + "reward_std": 0.042948655784130096, + "rewards/reward_fn/mean": 2.746156692504883, + "rewards/reward_fn/std": 0.0429486408829689, + "step": 1998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1248.0, + "completions/max_terminated_length": 1248.0, + "completions/mean_length": 303.78125, + "completions/mean_terminated_length": 303.78125, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.21205049326402886, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.024832285940647125, + "learning_rate": 7.2008e-06, + "loss": 0.1743, + "num_tokens": 92407335.0, + "reward": 3.5415878295898438, + "reward_std": 0.8036985993385315, + "rewards/reward_fn/mean": 3.5415878295898438, + "rewards/reward_fn/std": 0.8036985397338867, + "step": 1999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 619.0, + "completions/max_terminated_length": 619.0, + "completions/mean_length": 152.71875, + "completions/mean_terminated_length": 152.71875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.21215657154980375, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.65625, + "kl": 0.023261455935426056, + "learning_rate": 7.2004e-06, + "loss": 0.0307, + "num_tokens": 92451390.0, + "reward": 3.0803160667419434, + "reward_std": 0.3557732403278351, + "rewards/reward_fn/mean": 3.0803160667419434, + "rewards/reward_fn/std": 0.3557732403278351, + "step": 2000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.0, + "completions/max_terminated_length": 757.0, + "completions/mean_length": 200.5, + "completions/mean_terminated_length": 200.5, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.21226264983557866, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0693359375, + "kl": 0.01706216251477599, + "learning_rate": 7.2e-06, + "loss": 0.0007, + "num_tokens": 92487726.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 178.0, + "completions/max_terminated_length": 178.0, + "completions/mean_length": 110.90625, + "completions/mean_terminated_length": 110.90625, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.21236872812135357, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.015625, + "kl": 0.04754210542887449, + "learning_rate": 7.1996e-06, + "loss": 0.1082, + "num_tokens": 92525099.0, + "reward": 3.045430898666382, + "reward_std": 0.03022829256951809, + "rewards/reward_fn/mean": 3.045430898666382, + "rewards/reward_fn/std": 0.030228327959775925, + "step": 2002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1771.0, + "completions/mean_length": 633.5625, + "completions/mean_terminated_length": 587.9354858398438, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.21247480640712846, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.02199251647107303, + "learning_rate": 7.1992e-06, + "loss": 0.0533, + "num_tokens": 92581405.0, + "reward": 2.586845874786377, + "reward_std": 0.8533264994621277, + "rewards/reward_fn/mean": 2.586845874786377, + "rewards/reward_fn/std": 0.8533264994621277, + "step": 2003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1235.0, + "completions/max_terminated_length": 1235.0, + "completions/mean_length": 310.8125, + "completions/mean_terminated_length": 310.8125, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.21258088469290337, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.02182422927580774, + "learning_rate": 7.1988e-06, + "loss": -0.0255, + "num_tokens": 92610615.0, + "reward": 2.3631041049957275, + "reward_std": 0.5505119562149048, + "rewards/reward_fn/mean": 2.3631041049957275, + "rewards/reward_fn/std": 0.5505119562149048, + "step": 2004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 203.15625, + "completions/mean_terminated_length": 203.15625, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.21268696297867826, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.064453125, + "kl": 0.01677056518383324, + "learning_rate": 7.1984e-06, + "loss": 0.0007, + "num_tokens": 92649212.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 868.0, + "completions/max_terminated_length": 868.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 271.21875, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.21279304126445317, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.015512895653955638, + "learning_rate": 7.198e-06, + "loss": -0.0671, + "num_tokens": 92696067.0, + "reward": 3.7533555030822754, + "reward_std": 0.6800351738929749, + "rewards/reward_fn/mean": 3.7533555030822754, + "rewards/reward_fn/std": 0.6800351142883301, + "step": 2006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 791.0, + "completions/max_terminated_length": 791.0, + "completions/mean_length": 485.8125, + "completions/mean_terminated_length": 485.8125, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.21289911955022806, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.028621545527130365, + "learning_rate": 7.1976e-06, + "loss": 0.0727, + "num_tokens": 92746141.0, + "reward": 2.80375337600708, + "reward_std": 0.4395216107368469, + "rewards/reward_fn/mean": 2.80375337600708, + "rewards/reward_fn/std": 0.43952158093452454, + "step": 2007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 972.0, + "completions/max_terminated_length": 972.0, + "completions/mean_length": 524.5, + "completions/mean_terminated_length": 524.5, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.21300519783600297, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.02100277761928737, + "learning_rate": 7.1971999999999995e-06, + "loss": -0.0492, + "num_tokens": 92799437.0, + "reward": 2.993116855621338, + "reward_std": 0.6083241701126099, + "rewards/reward_fn/mean": 2.993116855621338, + "rewards/reward_fn/std": 0.6083241701126099, + "step": 2008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1339.0, + "completions/max_terminated_length": 1339.0, + "completions/mean_length": 332.84375, + "completions/mean_terminated_length": 332.84375, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.21311127612177788, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.028305693296715617, + "learning_rate": 7.1967999999999994e-06, + "loss": 0.0428, + "num_tokens": 92822056.0, + "reward": 3.8866019248962402, + "reward_std": 0.35826677083969116, + "rewards/reward_fn/mean": 3.8866019248962402, + "rewards/reward_fn/std": 0.35826677083969116, + "step": 2009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1088.0, + "completions/max_terminated_length": 1088.0, + "completions/mean_length": 354.46875, + "completions/mean_terminated_length": 354.46875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.21321735440755277, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.019336213706992567, + "learning_rate": 7.196399999999999e-06, + "loss": 0.0278, + "num_tokens": 92879767.0, + "reward": 3.6398813724517822, + "reward_std": 0.5077115297317505, + "rewards/reward_fn/mean": 3.6398813724517822, + "rewards/reward_fn/std": 0.5077115297317505, + "step": 2010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2033.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 612.59375, + "completions/mean_terminated_length": 612.59375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.21332343269332768, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.015353482798673213, + "learning_rate": 7.195999999999999e-06, + "loss": -0.0099, + "num_tokens": 92932458.0, + "reward": 3.1320223808288574, + "reward_std": 0.7826876640319824, + "rewards/reward_fn/mean": 3.1320223808288574, + "rewards/reward_fn/std": 0.7826876044273376, + "step": 2011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 153.5625, + "completions/mean_terminated_length": 153.5625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.21342951097910257, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "kl": 0.021226009470410645, + "learning_rate": 7.195599999999999e-06, + "loss": -0.0176, + "num_tokens": 92990620.0, + "reward": 3.7923405170440674, + "reward_std": 0.6559759974479675, + "rewards/reward_fn/mean": 3.7923405170440674, + "rewards/reward_fn/std": 0.6559760570526123, + "step": 2012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1211.0, + "completions/max_terminated_length": 1211.0, + "completions/mean_length": 419.5, + "completions/mean_terminated_length": 419.5, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.21353558926487748, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.02267545904032886, + "learning_rate": 7.195199999999999e-06, + "loss": 0.1904, + "num_tokens": 93033100.0, + "reward": 3.1862945556640625, + "reward_std": 0.47886648774147034, + "rewards/reward_fn/mean": 3.1862945556640625, + "rewards/reward_fn/std": 0.47886648774147034, + "step": 2013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 165.3125, + "completions/mean_terminated_length": 165.3125, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.2136416675506524, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.018245216575451195, + "learning_rate": 7.194799999999999e-06, + "loss": -0.0072, + "num_tokens": 93074614.0, + "reward": 3.9680566787719727, + "reward_std": 0.18069864809513092, + "rewards/reward_fn/mean": 3.9680566787719727, + "rewards/reward_fn/std": 0.18069863319396973, + "step": 2014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 83.25, + "completions/mean_terminated_length": 83.25, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.21374774583642728, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11474609375, + "kl": 0.01922210631892085, + "learning_rate": 7.194399999999999e-06, + "loss": 0.0008, + "num_tokens": 93115646.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 195.34375, + "completions/mean_terminated_length": 195.34375, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.2138538241222022, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09423828125, + "kl": 0.0225376442540437, + "learning_rate": 7.193999999999999e-06, + "loss": 0.0009, + "num_tokens": 93176233.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1295.0, + "completions/max_terminated_length": 1295.0, + "completions/mean_length": 311.3125, + "completions/mean_terminated_length": 311.3125, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.21395990240797708, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.023441240657120943, + "learning_rate": 7.1936e-06, + "loss": 0.1198, + "num_tokens": 93224051.0, + "reward": 3.871459484100342, + "reward_std": 0.3460574746131897, + "rewards/reward_fn/mean": 3.871459484100342, + "rewards/reward_fn/std": 0.3460574448108673, + "step": 2017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2015.0, + "completions/max_terminated_length": 2015.0, + "completions/mean_length": 281.15625, + "completions/mean_terminated_length": 281.15625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.214065980693752, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.023096083430573344, + "learning_rate": 7.1932e-06, + "loss": 0.0427, + "num_tokens": 93295032.0, + "reward": 3.9682388305664062, + "reward_std": 0.17966748774051666, + "rewards/reward_fn/mean": 3.9682388305664062, + "rewards/reward_fn/std": 0.17966745793819427, + "step": 2018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 894.0, + "completions/max_terminated_length": 894.0, + "completions/mean_length": 342.625, + "completions/mean_terminated_length": 342.625, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.2141720589795269, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.095703125, + "kl": 0.02021972427610308, + "learning_rate": 7.1928e-06, + "loss": 0.0008, + "num_tokens": 93340396.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1085.0, + "completions/mean_length": 532.6875, + "completions/mean_terminated_length": 483.8064270019531, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.2142781372653018, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.019356919452548027, + "learning_rate": 7.1924e-06, + "loss": 0.2521, + "num_tokens": 93410210.0, + "reward": 2.8118836879730225, + "reward_std": 0.5149768590927124, + "rewards/reward_fn/mean": 2.8118836879730225, + "rewards/reward_fn/std": 0.5149767994880676, + "step": 2020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1524.0, + "completions/max_terminated_length": 1524.0, + "completions/mean_length": 535.75, + "completions/mean_terminated_length": 535.75, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.2143842155510767, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.02323057595640421, + "learning_rate": 7.192e-06, + "loss": -0.1021, + "num_tokens": 93456890.0, + "reward": 2.6880881786346436, + "reward_std": 0.6508941054344177, + "rewards/reward_fn/mean": 2.6880881786346436, + "rewards/reward_fn/std": 0.650894045829773, + "step": 2021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 197.0625, + "completions/mean_terminated_length": 197.0625, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.2144902938368516, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.109375, + "kl": 0.02640698431059718, + "learning_rate": 7.1916e-06, + "loss": 0.0011, + "num_tokens": 93504572.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 895.0, + "completions/max_terminated_length": 895.0, + "completions/mean_length": 274.625, + "completions/mean_terminated_length": 274.625, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.2145963721226265, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.023409150540828705, + "learning_rate": 7.1912e-06, + "loss": 0.05, + "num_tokens": 93531216.0, + "reward": 3.9796085357666016, + "reward_std": 0.11535120010375977, + "rewards/reward_fn/mean": 3.9796085357666016, + "rewards/reward_fn/std": 0.11535115540027618, + "step": 2023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1389.0, + "completions/max_terminated_length": 1389.0, + "completions/mean_length": 379.875, + "completions/mean_terminated_length": 379.875, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.2147024504084014, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.018924297066405416, + "learning_rate": 7.1908e-06, + "loss": 0.0261, + "num_tokens": 93581708.0, + "reward": 3.8608009815216064, + "reward_std": 0.5477797389030457, + "rewards/reward_fn/mean": 3.8608009815216064, + "rewards/reward_fn/std": 0.5477797389030457, + "step": 2024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 798.0, + "completions/max_terminated_length": 798.0, + "completions/mean_length": 237.0625, + "completions/mean_terminated_length": 237.0625, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.2148085286941763, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.019628084031865, + "learning_rate": 7.1904e-06, + "loss": 0.0088, + "num_tokens": 93624078.0, + "reward": 3.351236581802368, + "reward_std": 0.6228786110877991, + "rewards/reward_fn/mean": 3.351236581802368, + "rewards/reward_fn/std": 0.6228786110877991, + "step": 2025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1163.0, + "completions/max_terminated_length": 1163.0, + "completions/mean_length": 382.375, + "completions/mean_terminated_length": 382.375, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.2149146069799512, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.022442698711529374, + "learning_rate": 7.19e-06, + "loss": 0.0868, + "num_tokens": 93690490.0, + "reward": 3.7622103691101074, + "reward_std": 0.5334495902061462, + "rewards/reward_fn/mean": 3.7622103691101074, + "rewards/reward_fn/std": 0.5334495902061462, + "step": 2026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 908.0, + "completions/max_terminated_length": 908.0, + "completions/mean_length": 258.0, + "completions/mean_terminated_length": 258.0, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.2150206852657261, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9765625, + "kl": 0.027412381023168564, + "learning_rate": 7.1896e-06, + "loss": -0.0866, + "num_tokens": 93729818.0, + "reward": 2.7926363945007324, + "reward_std": 0.02799339033663273, + "rewards/reward_fn/mean": 2.7926363945007324, + "rewards/reward_fn/std": 0.027993371710181236, + "step": 2027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 990.0, + "completions/max_terminated_length": 990.0, + "completions/mean_length": 241.59375, + "completions/mean_terminated_length": 241.59375, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.215126763551501, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.02329367445781827, + "learning_rate": 7.189199999999999e-06, + "loss": -0.0132, + "num_tokens": 93773933.0, + "reward": 3.294184923171997, + "reward_std": 0.3811061680316925, + "rewards/reward_fn/mean": 3.294184923171997, + "rewards/reward_fn/std": 0.3811061978340149, + "step": 2028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 788.0, + "completions/max_terminated_length": 788.0, + "completions/mean_length": 280.8125, + "completions/mean_terminated_length": 280.8125, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.21523284183727592, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5, + "kl": 0.030496369348838925, + "learning_rate": 7.1888e-06, + "loss": 0.0776, + "num_tokens": 93813511.0, + "reward": 3.894092082977295, + "reward_std": 0.4451395869255066, + "rewards/reward_fn/mean": 3.894092082977295, + "rewards/reward_fn/std": 0.4451395571231842, + "step": 2029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 656.0, + "completions/max_terminated_length": 656.0, + "completions/mean_length": 210.0, + "completions/mean_terminated_length": 210.0, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.2153389201230508, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10888671875, + "kl": 0.026559143094345927, + "learning_rate": 7.1884e-06, + "loss": 0.0011, + "num_tokens": 93862471.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1131.0, + "completions/max_terminated_length": 1131.0, + "completions/mean_length": 317.6875, + "completions/mean_terminated_length": 317.6875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.21544499840882572, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.02768648392520845, + "learning_rate": 7.188e-06, + "loss": -0.0168, + "num_tokens": 93903325.0, + "reward": 3.881580352783203, + "reward_std": 0.4127897024154663, + "rewards/reward_fn/mean": 3.881580352783203, + "rewards/reward_fn/std": 0.4127897024154663, + "step": 2031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 175.40625, + "completions/mean_terminated_length": 175.40625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.2155510766946006, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.019018501159735024, + "learning_rate": 7.1876e-06, + "loss": 0.0395, + "num_tokens": 93927658.0, + "reward": 3.96970534324646, + "reward_std": 0.17137275636196136, + "rewards/reward_fn/mean": 3.96970534324646, + "rewards/reward_fn/std": 0.17137275636196136, + "step": 2032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.0, + "completions/max_terminated_length": 559.0, + "completions/mean_length": 173.28125, + "completions/mean_terminated_length": 173.28125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.21565715498037552, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.026485492940992117, + "learning_rate": 7.1871999999999996e-06, + "loss": -0.0462, + "num_tokens": 93965459.0, + "reward": 3.959859848022461, + "reward_std": 0.227066308259964, + "rewards/reward_fn/mean": 3.959859848022461, + "rewards/reward_fn/std": 0.22706632316112518, + "step": 2033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1117.0, + "completions/max_terminated_length": 1117.0, + "completions/mean_length": 440.5625, + "completions/mean_terminated_length": 440.5625, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.2157632332661504, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.020580148906446993, + "learning_rate": 7.1867999999999995e-06, + "loss": 0.0985, + "num_tokens": 94001541.0, + "reward": 3.9648282527923584, + "reward_std": 0.1989613175392151, + "rewards/reward_fn/mean": 3.9648282527923584, + "rewards/reward_fn/std": 0.1989613175392151, + "step": 2034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.0, + "completions/max_terminated_length": 565.0, + "completions/mean_length": 377.375, + "completions/mean_terminated_length": 377.375, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.21586931155192532, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.020776059944182634, + "learning_rate": 7.1863999999999995e-06, + "loss": -0.0073, + "num_tokens": 94051185.0, + "reward": 3.9320802688598633, + "reward_std": 0.3842128813266754, + "rewards/reward_fn/mean": 3.9320802688598633, + "rewards/reward_fn/std": 0.3842128813266754, + "step": 2035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1297.0, + "completions/max_terminated_length": 1297.0, + "completions/mean_length": 383.0625, + "completions/mean_terminated_length": 383.0625, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.21597538983770023, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.03056110069155693, + "learning_rate": 7.1859999999999995e-06, + "loss": -0.0284, + "num_tokens": 94094355.0, + "reward": 2.9841325283050537, + "reward_std": 0.7235788106918335, + "rewards/reward_fn/mean": 2.9841325283050537, + "rewards/reward_fn/std": 0.7235787510871887, + "step": 2036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1229.0, + "completions/max_terminated_length": 1229.0, + "completions/mean_length": 234.84375, + "completions/mean_terminated_length": 234.84375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.21608146812347512, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.02781497361138463, + "learning_rate": 7.1855999999999994e-06, + "loss": 0.2453, + "num_tokens": 94125326.0, + "reward": 2.7703404426574707, + "reward_std": 0.03106667473912239, + "rewards/reward_fn/mean": 2.7703404426574707, + "rewards/reward_fn/std": 0.03106665052473545, + "step": 2037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 921.0, + "completions/max_terminated_length": 921.0, + "completions/mean_length": 303.0625, + "completions/mean_terminated_length": 303.0625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.21618754640925003, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.020188409835100174, + "learning_rate": 7.185199999999999e-06, + "loss": 0.1097, + "num_tokens": 94169200.0, + "reward": 2.8137176036834717, + "reward_std": 0.2147783488035202, + "rewards/reward_fn/mean": 2.8137176036834717, + "rewards/reward_fn/std": 0.21477839350700378, + "step": 2038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1176.0, + "completions/max_terminated_length": 1176.0, + "completions/mean_length": 250.71875, + "completions/mean_terminated_length": 250.71875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.21629362469502492, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.65625, + "kl": 0.03112406632862985, + "learning_rate": 7.184799999999999e-06, + "loss": 0.0002, + "num_tokens": 94212295.0, + "reward": 3.95988130569458, + "reward_std": 0.22694644331932068, + "rewards/reward_fn/mean": 3.95988130569458, + "rewards/reward_fn/std": 0.22694644331932068, + "step": 2039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 257.5, + "completions/mean_terminated_length": 257.5, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.21639970298079983, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0927734375, + "kl": 0.02199523849412799, + "learning_rate": 7.1844e-06, + "loss": 0.0009, + "num_tokens": 94274263.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1370.0, + "completions/max_terminated_length": 1370.0, + "completions/mean_length": 451.8125, + "completions/mean_terminated_length": 451.8125, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.21650578126657474, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.023585932329297066, + "learning_rate": 7.184e-06, + "loss": 0.1187, + "num_tokens": 94341937.0, + "reward": 2.8903353214263916, + "reward_std": 0.4277820885181427, + "rewards/reward_fn/mean": 2.8903353214263916, + "rewards/reward_fn/std": 0.4277820587158203, + "step": 2041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1882.0, + "completions/mean_length": 1021.90625, + "completions/mean_terminated_length": 988.806396484375, + "completions/min_length": 457.0, + "completions/min_terminated_length": 457.0, + "epoch": 0.21661185955234963, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.84765625, + "kl": 0.01376344496384263, + "learning_rate": 7.1836e-06, + "loss": 0.1077, + "num_tokens": 94416718.0, + "reward": 2.3924713134765625, + "reward_std": 0.48986151814460754, + "rewards/reward_fn/mean": 2.3924713134765625, + "rewards/reward_fn/std": 0.48986148834228516, + "step": 2042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 83.0, + "completions/max_terminated_length": 83.0, + "completions/mean_length": 71.8125, + "completions/mean_terminated_length": 71.8125, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.21671793783812454, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1484375, + "kl": 0.018146761576645076, + "learning_rate": 7.1832e-06, + "loss": 0.0007, + "num_tokens": 94453000.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 180.0, + "completions/max_terminated_length": 180.0, + "completions/mean_length": 103.5625, + "completions/mean_terminated_length": 103.5625, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.21682401612389943, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.138671875, + "kl": 0.024427478667348623, + "learning_rate": 7.1828e-06, + "loss": 0.001, + "num_tokens": 94487130.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1475.0, + "completions/max_terminated_length": 1475.0, + "completions/mean_length": 489.8125, + "completions/mean_terminated_length": 489.8125, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.21693009440967434, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.0271578470710665, + "learning_rate": 7.1824e-06, + "loss": -0.0102, + "num_tokens": 94531700.0, + "reward": 3.384610176086426, + "reward_std": 0.7715237736701965, + "rewards/reward_fn/mean": 3.384610176086426, + "rewards/reward_fn/std": 0.7715237140655518, + "step": 2045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1864.0, + "completions/max_terminated_length": 1864.0, + "completions/mean_length": 382.28125, + "completions/mean_terminated_length": 382.28125, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.21703617269544925, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.0231085903942585, + "learning_rate": 7.182e-06, + "loss": 0.1366, + "num_tokens": 94579197.0, + "reward": 2.9709372520446777, + "reward_std": 0.45013728737831116, + "rewards/reward_fn/mean": 2.9709372520446777, + "rewards/reward_fn/std": 0.45013728737831116, + "step": 2046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 736.0, + "completions/max_terminated_length": 736.0, + "completions/mean_length": 186.71875, + "completions/mean_terminated_length": 186.71875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.21714225098122414, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.578125, + "kl": 0.02708746073767543, + "learning_rate": 7.1816e-06, + "loss": 0.0161, + "num_tokens": 94622996.0, + "reward": 3.984982967376709, + "reward_std": 0.0849492996931076, + "rewards/reward_fn/mean": 3.984982967376709, + "rewards/reward_fn/std": 0.08494929224252701, + "step": 2047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1115.0, + "completions/max_terminated_length": 1115.0, + "completions/mean_length": 405.1875, + "completions/mean_terminated_length": 405.1875, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.21724832926699905, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.023749822983518243, + "learning_rate": 7.181199999999999e-06, + "loss": -0.0417, + "num_tokens": 94664762.0, + "reward": 3.4993603229522705, + "reward_std": 0.578482985496521, + "rewards/reward_fn/mean": 3.4993603229522705, + "rewards/reward_fn/std": 0.578482985496521, + "step": 2048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1071.0, + "completions/max_terminated_length": 1071.0, + "completions/mean_length": 237.0625, + "completions/mean_terminated_length": 237.0625, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.21735440755277394, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.021896290825679898, + "learning_rate": 7.180799999999999e-06, + "loss": 0.0248, + "num_tokens": 94703196.0, + "reward": 3.8218743801116943, + "reward_std": 0.4205699861049652, + "rewards/reward_fn/mean": 3.8218743801116943, + "rewards/reward_fn/std": 0.4205699861049652, + "step": 2049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 195.15625, + "completions/mean_terminated_length": 195.15625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.21746048583854885, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09521484375, + "kl": 0.018337349290959537, + "learning_rate": 7.180399999999999e-06, + "loss": 0.0007, + "num_tokens": 94753089.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1337.0, + "completions/max_terminated_length": 1337.0, + "completions/mean_length": 431.84375, + "completions/mean_terminated_length": 431.84375, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.21756656412432376, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.02290130709297955, + "learning_rate": 7.179999999999999e-06, + "loss": 0.081, + "num_tokens": 94804060.0, + "reward": 3.5846269130706787, + "reward_std": 0.518695592880249, + "rewards/reward_fn/mean": 3.5846269130706787, + "rewards/reward_fn/std": 0.518695592880249, + "step": 2051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 121.0, + "completions/max_terminated_length": 121.0, + "completions/mean_length": 85.78125, + "completions/mean_terminated_length": 85.78125, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.21767264241009865, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10205078125, + "kl": 0.017568445531651378, + "learning_rate": 7.1796e-06, + "loss": 0.0007, + "num_tokens": 94839189.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 925.0, + "completions/max_terminated_length": 925.0, + "completions/mean_length": 312.46875, + "completions/mean_terminated_length": 312.46875, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.21777872069587356, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059326171875, + "kl": 0.01560823933687061, + "learning_rate": 7.1792e-06, + "loss": 0.0006, + "num_tokens": 94891268.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 889.0, + "completions/max_terminated_length": 889.0, + "completions/mean_length": 170.53125, + "completions/mean_terminated_length": 170.53125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.21788479898164845, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.46875, + "kl": 0.024878250900655985, + "learning_rate": 7.1788e-06, + "loss": 0.0492, + "num_tokens": 94935125.0, + "reward": 2.791635513305664, + "reward_std": 0.030824407935142517, + "rewards/reward_fn/mean": 2.791635513305664, + "rewards/reward_fn/std": 0.03082440234720707, + "step": 2054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.0, + "completions/max_terminated_length": 592.0, + "completions/mean_length": 110.625, + "completions/mean_terminated_length": 110.625, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.21799087726742336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2431640625, + "kl": 0.0337254130281508, + "learning_rate": 7.1784e-06, + "loss": 0.0013, + "num_tokens": 94962089.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 699.0, + "completions/max_terminated_length": 699.0, + "completions/mean_length": 230.375, + "completions/mean_terminated_length": 230.375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.21809695555319827, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06494140625, + "kl": 0.016936297761276364, + "learning_rate": 7.178e-06, + "loss": 0.0007, + "num_tokens": 95006325.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 867.0, + "completions/max_terminated_length": 867.0, + "completions/mean_length": 189.65625, + "completions/mean_terminated_length": 189.65625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.21820303383897316, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.017722343327477574, + "learning_rate": 7.1776e-06, + "loss": -0.0566, + "num_tokens": 95052170.0, + "reward": 3.965939998626709, + "reward_std": 0.19267311692237854, + "rewards/reward_fn/mean": 3.965939998626709, + "rewards/reward_fn/std": 0.19267308712005615, + "step": 2057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1031.0, + "completions/max_terminated_length": 1031.0, + "completions/mean_length": 290.84375, + "completions/mean_terminated_length": 290.84375, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.21830911212474807, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.021622674306854606, + "learning_rate": 7.1772e-06, + "loss": -0.0281, + "num_tokens": 95087685.0, + "reward": 3.966918468475342, + "reward_std": 0.18713752925395966, + "rewards/reward_fn/mean": 3.966918468475342, + "rewards/reward_fn/std": 0.18713752925395966, + "step": 2058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 676.0, + "completions/max_terminated_length": 676.0, + "completions/mean_length": 215.75, + "completions/mean_terminated_length": 215.75, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.21841519041052296, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.028689004946500063, + "learning_rate": 7.1768e-06, + "loss": -0.0264, + "num_tokens": 95133725.0, + "reward": 3.931462287902832, + "reward_std": 0.27030280232429504, + "rewards/reward_fn/mean": 3.931462287902832, + "rewards/reward_fn/std": 0.27030277252197266, + "step": 2059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1146.0, + "completions/max_terminated_length": 1146.0, + "completions/mean_length": 359.40625, + "completions/mean_terminated_length": 359.40625, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.21852126869629787, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.02633265615440905, + "learning_rate": 7.1764e-06, + "loss": -0.0755, + "num_tokens": 95182442.0, + "reward": 2.791104793548584, + "reward_std": 0.3357614576816559, + "rewards/reward_fn/mean": 2.791104793548584, + "rewards/reward_fn/std": 0.3357614576816559, + "step": 2060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 236.09375, + "completions/mean_terminated_length": 236.09375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.21862734698207276, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.022103465860709548, + "learning_rate": 7.1759999999999996e-06, + "loss": 0.0616, + "num_tokens": 95224621.0, + "reward": 3.220750331878662, + "reward_std": 0.6474551558494568, + "rewards/reward_fn/mean": 3.220750331878662, + "rewards/reward_fn/std": 0.6474552154541016, + "step": 2061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1164.0, + "completions/mean_length": 473.78125, + "completions/mean_terminated_length": 423.0, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.21873342526784767, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.02473957440815866, + "learning_rate": 7.1755999999999995e-06, + "loss": 0.1878, + "num_tokens": 95274918.0, + "reward": 2.2360482215881348, + "reward_std": 0.6540963649749756, + "rewards/reward_fn/mean": 2.2360482215881348, + "rewards/reward_fn/std": 0.6540964245796204, + "step": 2062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1755.0, + "completions/mean_length": 770.78125, + "completions/mean_terminated_length": 685.6333618164062, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.21883950355362258, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.953125, + "kl": 0.014348611701279879, + "learning_rate": 7.1751999999999995e-06, + "loss": 0.0931, + "num_tokens": 95336959.0, + "reward": 2.4014182090759277, + "reward_std": 0.6035107970237732, + "rewards/reward_fn/mean": 2.4014182090759277, + "rewards/reward_fn/std": 0.6035107970237732, + "step": 2063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 306.75, + "completions/mean_terminated_length": 306.75, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.21894558183939747, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.02311077411286533, + "learning_rate": 7.1748e-06, + "loss": -0.0083, + "num_tokens": 95382743.0, + "reward": 3.9660825729370117, + "reward_std": 0.1918664574623108, + "rewards/reward_fn/mean": 3.9660825729370117, + "rewards/reward_fn/std": 0.1918664425611496, + "step": 2064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 793.0, + "completions/max_terminated_length": 793.0, + "completions/mean_length": 231.59375, + "completions/mean_terminated_length": 231.59375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.21905166012517238, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.022033887798897922, + "learning_rate": 7.1744e-06, + "loss": -0.0387, + "num_tokens": 95426218.0, + "reward": 3.723904609680176, + "reward_std": 0.6886054873466492, + "rewards/reward_fn/mean": 3.723904609680176, + "rewards/reward_fn/std": 0.6886054277420044, + "step": 2065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 185.28125, + "completions/mean_terminated_length": 185.28125, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.21915773841094727, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.01918771117925644, + "learning_rate": 7.174e-06, + "loss": 0.0791, + "num_tokens": 95467795.0, + "reward": 3.837489128112793, + "reward_std": 0.3440697491168976, + "rewards/reward_fn/mean": 3.837489128112793, + "rewards/reward_fn/std": 0.34406977891921997, + "step": 2066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1059.0, + "completions/max_terminated_length": 1059.0, + "completions/mean_length": 264.125, + "completions/mean_terminated_length": 264.125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.21926381669672218, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.0323160660918802, + "learning_rate": 7.1736e-06, + "loss": 0.1329, + "num_tokens": 95510327.0, + "reward": 3.0511176586151123, + "reward_std": 0.36487215757369995, + "rewards/reward_fn/mean": 3.0511176586151123, + "rewards/reward_fn/std": 0.36487212777137756, + "step": 2067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1123.0, + "completions/mean_length": 561.3125, + "completions/mean_terminated_length": 513.3547973632812, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.2193698949824971, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.017476799665018916, + "learning_rate": 7.173199999999999e-06, + "loss": 0.2184, + "num_tokens": 95564385.0, + "reward": 2.6805789470672607, + "reward_std": 0.25595468282699585, + "rewards/reward_fn/mean": 2.6805789470672607, + "rewards/reward_fn/std": 0.25595468282699585, + "step": 2068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 713.0, + "completions/max_terminated_length": 713.0, + "completions/mean_length": 273.28125, + "completions/mean_terminated_length": 273.28125, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.21947597326827198, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.125, + "kl": 0.03495682845823467, + "learning_rate": 7.172799999999999e-06, + "loss": 0.0014, + "num_tokens": 95619082.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 180.8125, + "completions/mean_terminated_length": 180.8125, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.2195820515540469, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.016302260337397456, + "learning_rate": 7.172399999999999e-06, + "loss": -0.0412, + "num_tokens": 95660580.0, + "reward": 3.9653358459472656, + "reward_std": 0.19608987867832184, + "rewards/reward_fn/mean": 3.9653358459472656, + "rewards/reward_fn/std": 0.19608986377716064, + "step": 2070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 818.0, + "completions/max_terminated_length": 818.0, + "completions/mean_length": 289.3125, + "completions/mean_terminated_length": 289.3125, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.21968812983982178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06982421875, + "kl": 0.02007239032536745, + "learning_rate": 7.171999999999999e-06, + "loss": 0.0008, + "num_tokens": 95722414.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 861.0, + "completions/max_terminated_length": 861.0, + "completions/mean_length": 212.625, + "completions/mean_terminated_length": 212.625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.2197942081255967, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.027552237967029214, + "learning_rate": 7.171599999999999e-06, + "loss": -0.0198, + "num_tokens": 95752738.0, + "reward": 2.9689650535583496, + "reward_std": 0.0429152250289917, + "rewards/reward_fn/mean": 2.9689650535583496, + "rewards/reward_fn/std": 0.04291524365544319, + "step": 2072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 167.78125, + "completions/mean_terminated_length": 167.78125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.2199002864113716, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.171875, + "kl": 0.022559367353096604, + "learning_rate": 7.171199999999999e-06, + "loss": 0.0009, + "num_tokens": 95815675.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 856.0, + "completions/max_terminated_length": 856.0, + "completions/mean_length": 212.5625, + "completions/mean_terminated_length": 212.5625, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.2200063646971465, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.010360506421420723, + "learning_rate": 7.170799999999999e-06, + "loss": -0.0001, + "num_tokens": 95860493.0, + "reward": 3.929622173309326, + "reward_std": 0.3981178402900696, + "rewards/reward_fn/mean": 3.929622173309326, + "rewards/reward_fn/std": 0.3981178402900696, + "step": 2074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 206.3125, + "completions/mean_terminated_length": 206.3125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.2201124429829214, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0712890625, + "kl": 0.01717807969544083, + "learning_rate": 7.170399999999999e-06, + "loss": 0.0007, + "num_tokens": 95918807.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1029.0, + "completions/max_terminated_length": 1029.0, + "completions/mean_length": 389.65625, + "completions/mean_terminated_length": 389.65625, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.2202185212686963, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.02313116961158812, + "learning_rate": 7.17e-06, + "loss": 0.0968, + "num_tokens": 95967660.0, + "reward": 3.746159553527832, + "reward_std": 0.6335774660110474, + "rewards/reward_fn/mean": 3.746159553527832, + "rewards/reward_fn/std": 0.6335774064064026, + "step": 2076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 192.6875, + "completions/mean_terminated_length": 192.6875, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.2203245995544712, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.029190136585384607, + "learning_rate": 7.1696e-06, + "loss": 0.0691, + "num_tokens": 96007010.0, + "reward": 3.521986484527588, + "reward_std": 0.6271064877510071, + "rewards/reward_fn/mean": 3.521986484527588, + "rewards/reward_fn/std": 0.6271064877510071, + "step": 2077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 749.0, + "completions/max_terminated_length": 749.0, + "completions/mean_length": 256.9375, + "completions/mean_terminated_length": 256.9375, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.22043067784024611, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.031511508859694004, + "learning_rate": 7.1692e-06, + "loss": -0.0353, + "num_tokens": 96050592.0, + "reward": 3.8938238620758057, + "reward_std": 0.43997421860694885, + "rewards/reward_fn/mean": 3.8938238620758057, + "rewards/reward_fn/std": 0.43997427821159363, + "step": 2078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 819.0, + "completions/max_terminated_length": 819.0, + "completions/mean_length": 457.96875, + "completions/mean_terminated_length": 457.96875, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.220536756126021, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.01874492526985705, + "learning_rate": 7.1688e-06, + "loss": 0.0453, + "num_tokens": 96117151.0, + "reward": 3.6167984008789062, + "reward_std": 0.5801927447319031, + "rewards/reward_fn/mean": 3.6167984008789062, + "rewards/reward_fn/std": 0.5801927447319031, + "step": 2079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.2206428344117959, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.015837123501114547, + "learning_rate": 7.1684e-06, + "loss": 0.0119, + "num_tokens": 96158739.0, + "reward": 3.959514617919922, + "reward_std": 0.22902005910873413, + "rewards/reward_fn/mean": 3.959514617919922, + "rewards/reward_fn/std": 0.22902007400989532, + "step": 2080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1087.0, + "completions/max_terminated_length": 1087.0, + "completions/mean_length": 333.1875, + "completions/mean_terminated_length": 333.1875, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.2207489126975708, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07861328125, + "kl": 0.020457167527638376, + "learning_rate": 7.168e-06, + "loss": 0.0008, + "num_tokens": 96208665.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 716.0, + "completions/max_terminated_length": 716.0, + "completions/mean_length": 225.65625, + "completions/mean_terminated_length": 225.65625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.2208549909833457, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.024195005418732762, + "learning_rate": 7.1676e-06, + "loss": -0.1019, + "num_tokens": 96244942.0, + "reward": 3.6277449131011963, + "reward_std": 0.4575171172618866, + "rewards/reward_fn/mean": 3.6277449131011963, + "rewards/reward_fn/std": 0.4575170576572418, + "step": 2082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 755.0, + "completions/max_terminated_length": 755.0, + "completions/mean_length": 251.9375, + "completions/mean_terminated_length": 251.9375, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.22096106926912062, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "kl": 0.02725498448126018, + "learning_rate": 7.1672e-06, + "loss": 0.0831, + "num_tokens": 96268908.0, + "reward": 3.8278117179870605, + "reward_std": 0.5053116083145142, + "rewards/reward_fn/mean": 3.8278117179870605, + "rewards/reward_fn/std": 0.5053115487098694, + "step": 2083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 206.4375, + "completions/mean_terminated_length": 206.4375, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.2210671475548955, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.01674318127334118, + "learning_rate": 7.1668e-06, + "loss": -0.03, + "num_tokens": 96305498.0, + "reward": 2.9250006675720215, + "reward_std": 0.04684029147028923, + "rewards/reward_fn/mean": 2.9250006675720215, + "rewards/reward_fn/std": 0.04684024676680565, + "step": 2084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 122.78125, + "completions/mean_terminated_length": 122.78125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.22117322584067042, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10888671875, + "kl": 0.026960970601066947, + "learning_rate": 7.1664e-06, + "loss": 0.0011, + "num_tokens": 96357139.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1214.0, + "completions/max_terminated_length": 1214.0, + "completions/mean_length": 359.6875, + "completions/mean_terminated_length": 359.6875, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.2212793041264453, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.021832972299307585, + "learning_rate": 7.166e-06, + "loss": 0.0165, + "num_tokens": 96402345.0, + "reward": 2.866878032684326, + "reward_std": 0.3685334324836731, + "rewards/reward_fn/mean": 2.866878032684326, + "rewards/reward_fn/std": 0.3685334324836731, + "step": 2086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 633.0, + "completions/max_terminated_length": 633.0, + "completions/mean_length": 120.90625, + "completions/mean_terminated_length": 120.90625, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.22138538241222022, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.515625, + "kl": 0.022863602731376886, + "learning_rate": 7.1656000000000005e-06, + "loss": 0.1065, + "num_tokens": 96439302.0, + "reward": 2.8354334831237793, + "reward_std": 0.03567254915833473, + "rewards/reward_fn/mean": 2.8354334831237793, + "rewards/reward_fn/std": 0.035672519356012344, + "step": 2087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 937.0, + "completions/max_terminated_length": 937.0, + "completions/mean_length": 245.25, + "completions/mean_terminated_length": 245.25, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.2214914606979951, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "kl": 0.02509408933110535, + "learning_rate": 7.1652e-06, + "loss": 0.093, + "num_tokens": 96492078.0, + "reward": 3.8152360916137695, + "reward_std": 0.3649226725101471, + "rewards/reward_fn/mean": 3.8152360916137695, + "rewards/reward_fn/std": 0.3649226725101471, + "step": 2088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 195.0, + "completions/mean_terminated_length": 195.0, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.22159753898377002, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.020710675860755146, + "learning_rate": 7.1647999999999996e-06, + "loss": -0.0253, + "num_tokens": 96516622.0, + "reward": 3.013934850692749, + "reward_std": 0.325973778963089, + "rewards/reward_fn/mean": 3.013934850692749, + "rewards/reward_fn/std": 0.325973778963089, + "step": 2089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 890.0, + "completions/max_terminated_length": 890.0, + "completions/mean_length": 322.78125, + "completions/mean_terminated_length": 322.78125, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.22170361726954493, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "kl": 0.01796165155246854, + "learning_rate": 7.1643999999999995e-06, + "loss": 0.1965, + "num_tokens": 96562535.0, + "reward": 3.9826741218566895, + "reward_std": 0.0980101004242897, + "rewards/reward_fn/mean": 3.9826741218566895, + "rewards/reward_fn/std": 0.0980100929737091, + "step": 2090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 142.25, + "completions/mean_terminated_length": 142.25, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.22180969555531982, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.609375, + "kl": 0.0245153047144413, + "learning_rate": 7.1639999999999995e-06, + "loss": 0.0613, + "num_tokens": 96599823.0, + "reward": 3.8469858169555664, + "reward_std": 0.41139575839042664, + "rewards/reward_fn/mean": 3.8469858169555664, + "rewards/reward_fn/std": 0.4113958179950714, + "step": 2091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 126.0625, + "completions/mean_terminated_length": 126.0625, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.22191577384109473, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.84375, + "kl": 0.030944793485105038, + "learning_rate": 7.1635999999999995e-06, + "loss": 0.1047, + "num_tokens": 96639345.0, + "reward": 2.86065936088562, + "reward_std": 0.05229390040040016, + "rewards/reward_fn/mean": 2.86065936088562, + "rewards/reward_fn/std": 0.05229390785098076, + "step": 2092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 740.0, + "completions/max_terminated_length": 740.0, + "completions/mean_length": 242.9375, + "completions/mean_terminated_length": 242.9375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.22202185212686962, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.28125, + "kl": 0.024946002522483468, + "learning_rate": 7.1631999999999995e-06, + "loss": 0.0677, + "num_tokens": 96661999.0, + "reward": 3.1001062393188477, + "reward_std": 0.3169058561325073, + "rewards/reward_fn/mean": 3.1001062393188477, + "rewards/reward_fn/std": 0.31690582633018494, + "step": 2093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 806.0, + "completions/max_terminated_length": 806.0, + "completions/mean_length": 145.46875, + "completions/mean_terminated_length": 145.46875, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.22212793041264453, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1416015625, + "kl": 0.03213479835540056, + "learning_rate": 7.162799999999999e-06, + "loss": 0.0013, + "num_tokens": 96683358.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 239.59375, + "completions/mean_terminated_length": 239.59375, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.22223400869841944, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.014876898960210383, + "learning_rate": 7.162399999999999e-06, + "loss": 0.0037, + "num_tokens": 96730929.0, + "reward": 3.8969430923461914, + "reward_std": 0.4309411346912384, + "rewards/reward_fn/mean": 3.8969430923461914, + "rewards/reward_fn/std": 0.430941104888916, + "step": 2095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1064.0, + "completions/max_terminated_length": 1064.0, + "completions/mean_length": 305.5625, + "completions/mean_terminated_length": 305.5625, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.22234008698419433, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.01991373603232205, + "learning_rate": 7.161999999999999e-06, + "loss": 0.0078, + "num_tokens": 96776515.0, + "reward": 3.0914957523345947, + "reward_std": 0.5341657996177673, + "rewards/reward_fn/mean": 3.0914957523345947, + "rewards/reward_fn/std": 0.5341657996177673, + "step": 2096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1050.0, + "completions/max_terminated_length": 1050.0, + "completions/mean_length": 309.96875, + "completions/mean_terminated_length": 309.96875, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.22244616526996924, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.21875, + "kl": 0.022031590808182955, + "learning_rate": 7.161599999999999e-06, + "loss": 0.0249, + "num_tokens": 96829954.0, + "reward": 3.9301846027374268, + "reward_std": 0.3949355185031891, + "rewards/reward_fn/mean": 3.9301846027374268, + "rewards/reward_fn/std": 0.3949355185031891, + "step": 2097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1887.0, + "completions/max_terminated_length": 1887.0, + "completions/mean_length": 511.59375, + "completions/mean_terminated_length": 511.59375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.22255224355574413, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.023883348098024726, + "learning_rate": 7.161199999999999e-06, + "loss": 0.0523, + "num_tokens": 96860949.0, + "reward": 2.677259922027588, + "reward_std": 0.08000855147838593, + "rewards/reward_fn/mean": 2.677259922027588, + "rewards/reward_fn/std": 0.08000854402780533, + "step": 2098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1283.0, + "completions/max_terminated_length": 1283.0, + "completions/mean_length": 352.71875, + "completions/mean_terminated_length": 352.71875, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.22265832184151904, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.023541218601167202, + "learning_rate": 7.1608e-06, + "loss": -0.0193, + "num_tokens": 96929548.0, + "reward": 3.930210590362549, + "reward_std": 0.27486422657966614, + "rewards/reward_fn/mean": 3.930210590362549, + "rewards/reward_fn/std": 0.27486422657966614, + "step": 2099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 945.0, + "completions/max_terminated_length": 945.0, + "completions/mean_length": 254.5625, + "completions/mean_terminated_length": 254.5625, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.22276440012729395, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.028475441271439195, + "learning_rate": 7.1604e-06, + "loss": -0.0486, + "num_tokens": 96967934.0, + "reward": 3.4316561222076416, + "reward_std": 0.6160090565681458, + "rewards/reward_fn/mean": 3.4316561222076416, + "rewards/reward_fn/std": 0.6160091161727905, + "step": 2100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 158.09375, + "completions/mean_terminated_length": 158.09375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.22287047841306884, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0869140625, + "kl": 0.01883817312773317, + "learning_rate": 7.16e-06, + "loss": 0.0008, + "num_tokens": 97000769.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 894.0, + "completions/max_terminated_length": 894.0, + "completions/mean_length": 325.4375, + "completions/mean_terminated_length": 325.4375, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.22297655669884375, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9375, + "kl": 0.02302585169672966, + "learning_rate": 7.1596e-06, + "loss": 0.0672, + "num_tokens": 97046383.0, + "reward": 3.923642158508301, + "reward_std": 0.3004699647426605, + "rewards/reward_fn/mean": 3.923642158508301, + "rewards/reward_fn/std": 0.3004699647426605, + "step": 2102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1747.0, + "completions/max_terminated_length": 1747.0, + "completions/mean_length": 518.75, + "completions/mean_terminated_length": 518.75, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.22308263498461864, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.020775122102349997, + "learning_rate": 7.1592e-06, + "loss": -0.0671, + "num_tokens": 97085383.0, + "reward": 3.2577528953552246, + "reward_std": 0.5909585356712341, + "rewards/reward_fn/mean": 3.2577528953552246, + "rewards/reward_fn/std": 0.5909585356712341, + "step": 2103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 133.34375, + "completions/mean_terminated_length": 133.34375, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.22318871327039355, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11328125, + "kl": 0.021738857380114496, + "learning_rate": 7.1588e-06, + "loss": 0.0009, + "num_tokens": 97141842.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 170.125, + "completions/mean_terminated_length": 170.125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.22329479155616846, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.033207230269908905, + "learning_rate": 7.1584e-06, + "loss": -0.0047, + "num_tokens": 97176086.0, + "reward": 2.8362083435058594, + "reward_std": 0.036612384021282196, + "rewards/reward_fn/mean": 2.8362083435058594, + "rewards/reward_fn/std": 0.03661240264773369, + "step": 2105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1838.0, + "completions/max_terminated_length": 1838.0, + "completions/mean_length": 596.125, + "completions/mean_terminated_length": 596.125, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.22340086984194335, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1875, + "kl": 0.018431765493005514, + "learning_rate": 7.158e-06, + "loss": 0.1009, + "num_tokens": 97241338.0, + "reward": 2.9607906341552734, + "reward_std": 0.35934340953826904, + "rewards/reward_fn/mean": 2.9607906341552734, + "rewards/reward_fn/std": 0.35934343934059143, + "step": 2106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1573.0, + "completions/mean_length": 785.53125, + "completions/mean_terminated_length": 744.8064575195312, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.22350694812771826, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.01531498518306762, + "learning_rate": 7.1576e-06, + "loss": 0.1756, + "num_tokens": 97316107.0, + "reward": 3.421172618865967, + "reward_std": 0.9520739316940308, + "rewards/reward_fn/mean": 3.421172618865967, + "rewards/reward_fn/std": 0.9520739316940308, + "step": 2107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 183.09375, + "completions/mean_terminated_length": 183.09375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.22361302641349315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08349609375, + "kl": 0.019659267854876816, + "learning_rate": 7.157199999999999e-06, + "loss": 0.0008, + "num_tokens": 97373774.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1686.0, + "completions/max_terminated_length": 1686.0, + "completions/mean_length": 378.09375, + "completions/mean_terminated_length": 378.09375, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.22371910469926806, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.02354878233745694, + "learning_rate": 7.156799999999999e-06, + "loss": -0.1435, + "num_tokens": 97416305.0, + "reward": 3.2668161392211914, + "reward_std": 0.6438294053077698, + "rewards/reward_fn/mean": 3.2668161392211914, + "rewards/reward_fn/std": 0.6438294053077698, + "step": 2109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1249.0, + "completions/max_terminated_length": 1249.0, + "completions/mean_length": 371.375, + "completions/mean_terminated_length": 371.375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.22382518298504298, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8359375, + "kl": 0.023293010191991925, + "learning_rate": 7.156399999999999e-06, + "loss": -0.08, + "num_tokens": 97471517.0, + "reward": 3.9675302505493164, + "reward_std": 0.18367597460746765, + "rewards/reward_fn/mean": 3.9675302505493164, + "rewards/reward_fn/std": 0.18367597460746765, + "step": 2110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1040.0, + "completions/max_terminated_length": 1040.0, + "completions/mean_length": 250.875, + "completions/mean_terminated_length": 250.875, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.22393126127081786, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.0269605559296906, + "learning_rate": 7.156e-06, + "loss": 0.0463, + "num_tokens": 97513113.0, + "reward": 3.773740291595459, + "reward_std": 0.4349633753299713, + "rewards/reward_fn/mean": 3.773740291595459, + "rewards/reward_fn/std": 0.4349633753299713, + "step": 2111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1635.0, + "completions/max_terminated_length": 1635.0, + "completions/mean_length": 382.4375, + "completions/mean_terminated_length": 382.4375, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.22403733955659277, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.030054514296352863, + "learning_rate": 7.1556e-06, + "loss": 0.092, + "num_tokens": 97554151.0, + "reward": 3.541625499725342, + "reward_std": 0.634077250957489, + "rewards/reward_fn/mean": 3.541625499725342, + "rewards/reward_fn/std": 0.634077250957489, + "step": 2112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1189.0, + "completions/mean_length": 433.3125, + "completions/mean_terminated_length": 381.2257995605469, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.22414341784236766, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.019980568438768387, + "learning_rate": 7.1552e-06, + "loss": 0.2043, + "num_tokens": 97603313.0, + "reward": 2.8600192070007324, + "reward_std": 0.5243455171585083, + "rewards/reward_fn/mean": 2.8600192070007324, + "rewards/reward_fn/std": 0.5243453979492188, + "step": 2113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1545.0, + "completions/max_terminated_length": 1545.0, + "completions/mean_length": 422.5, + "completions/mean_terminated_length": 422.5, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.22424949612814257, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.023464223137125373, + "learning_rate": 7.1548e-06, + "loss": -0.0462, + "num_tokens": 97652833.0, + "reward": 2.905588150024414, + "reward_std": 0.43429967761039734, + "rewards/reward_fn/mean": 2.905588150024414, + "rewards/reward_fn/std": 0.43429967761039734, + "step": 2114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.0, + "completions/max_terminated_length": 628.0, + "completions/mean_length": 110.65625, + "completions/mean_terminated_length": 110.65625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.22435557441391746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07421875, + "kl": 0.016824107500724494, + "learning_rate": 7.1544e-06, + "loss": 0.0007, + "num_tokens": 97702038.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1145.0, + "completions/max_terminated_length": 1145.0, + "completions/mean_length": 490.8125, + "completions/mean_terminated_length": 490.8125, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.22446165269969237, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.019860751694068313, + "learning_rate": 7.154e-06, + "loss": 0.1159, + "num_tokens": 97761872.0, + "reward": 2.950417995452881, + "reward_std": 0.4983111023902893, + "rewards/reward_fn/mean": 2.950417995452881, + "rewards/reward_fn/std": 0.4983111023902893, + "step": 2116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1101.0, + "completions/max_terminated_length": 1101.0, + "completions/mean_length": 337.09375, + "completions/mean_terminated_length": 337.09375, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.22456773098546728, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.023477735929191113, + "learning_rate": 7.1535999999999996e-06, + "loss": -0.0293, + "num_tokens": 97814899.0, + "reward": 2.9934606552124023, + "reward_std": 0.7055239677429199, + "rewards/reward_fn/mean": 2.9934606552124023, + "rewards/reward_fn/std": 0.7055239081382751, + "step": 2117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 79.625, + "completions/mean_terminated_length": 79.625, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.22467380927124217, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1083984375, + "kl": 0.016976277576759458, + "learning_rate": 7.1531999999999995e-06, + "loss": 0.0007, + "num_tokens": 97852359.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 702.0, + "completions/max_terminated_length": 702.0, + "completions/mean_length": 207.28125, + "completions/mean_terminated_length": 207.28125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.22477988755701708, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10400390625, + "kl": 0.024817738914862275, + "learning_rate": 7.1527999999999995e-06, + "loss": 0.001, + "num_tokens": 97893552.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 814.0, + "completions/max_terminated_length": 814.0, + "completions/mean_length": 339.375, + "completions/mean_terminated_length": 339.375, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.22488596584279197, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.02061704732477665, + "learning_rate": 7.1523999999999995e-06, + "loss": 0.0577, + "num_tokens": 97941532.0, + "reward": 3.6477723121643066, + "reward_std": 0.603849470615387, + "rewards/reward_fn/mean": 3.6477723121643066, + "rewards/reward_fn/std": 0.603849470615387, + "step": 2120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1074.0, + "completions/max_terminated_length": 1074.0, + "completions/mean_length": 389.875, + "completions/mean_terminated_length": 389.875, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.22499204412856688, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.016808875952847302, + "learning_rate": 7.1519999999999995e-06, + "loss": 0.0569, + "num_tokens": 97989560.0, + "reward": 3.6947927474975586, + "reward_std": 0.5374994874000549, + "rewards/reward_fn/mean": 3.6947927474975586, + "rewards/reward_fn/std": 0.5374994277954102, + "step": 2121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1265.0, + "completions/max_terminated_length": 1265.0, + "completions/mean_length": 301.03125, + "completions/mean_terminated_length": 301.03125, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.2250981224143418, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.017114053131081164, + "learning_rate": 7.1516e-06, + "loss": -0.0851, + "num_tokens": 98029145.0, + "reward": 3.0186710357666016, + "reward_std": 0.19123917818069458, + "rewards/reward_fn/mean": 3.0186710357666016, + "rewards/reward_fn/std": 0.1912391483783722, + "step": 2122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1113.0, + "completions/max_terminated_length": 1113.0, + "completions/mean_length": 243.0, + "completions/mean_terminated_length": 243.0, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.22520420070011668, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6875, + "kl": 0.02709133573807776, + "learning_rate": 7.1512e-06, + "loss": 0.06, + "num_tokens": 98067161.0, + "reward": 3.966747760772705, + "reward_std": 0.18810324370861053, + "rewards/reward_fn/mean": 3.966747760772705, + "rewards/reward_fn/std": 0.18810328841209412, + "step": 2123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1021.0, + "completions/max_terminated_length": 1021.0, + "completions/mean_length": 588.46875, + "completions/mean_terminated_length": 588.46875, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "epoch": 0.2253102789858916, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.02302650804631412, + "learning_rate": 7.1508e-06, + "loss": 0.1045, + "num_tokens": 98148168.0, + "reward": 2.512251853942871, + "reward_std": 0.5429190397262573, + "rewards/reward_fn/mean": 2.512251853942871, + "rewards/reward_fn/std": 0.5429189801216125, + "step": 2124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 286.03125, + "completions/mean_terminated_length": 286.03125, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.22541635727166648, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.024617045186460018, + "learning_rate": 7.1504e-06, + "loss": 0.1288, + "num_tokens": 98196681.0, + "reward": 3.893162727355957, + "reward_std": 0.3376566171646118, + "rewards/reward_fn/mean": 3.893162727355957, + "rewards/reward_fn/std": 0.3376566171646118, + "step": 2125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 219.75, + "completions/mean_terminated_length": 219.75, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.2255224355574414, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.0329362649936229, + "learning_rate": 7.15e-06, + "loss": 0.0216, + "num_tokens": 98247457.0, + "reward": 2.987175703048706, + "reward_std": 0.47789204120635986, + "rewards/reward_fn/mean": 2.987175703048706, + "rewards/reward_fn/std": 0.47789207100868225, + "step": 2126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 697.0, + "completions/max_terminated_length": 697.0, + "completions/mean_length": 213.40625, + "completions/mean_terminated_length": 213.40625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.2256285138432163, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057373046875, + "kl": 0.011680109717417508, + "learning_rate": 7.1496e-06, + "loss": 0.0005, + "num_tokens": 98285070.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 163.03125, + "completions/mean_terminated_length": 163.03125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.2257345921289912, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.078125, + "kl": 0.015980440541170537, + "learning_rate": 7.1492e-06, + "loss": 0.0006, + "num_tokens": 98311599.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1539.0, + "completions/max_terminated_length": 1539.0, + "completions/mean_length": 355.4375, + "completions/mean_terminated_length": 355.4375, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.2258406704147661, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.023469509556889534, + "learning_rate": 7.148799999999999e-06, + "loss": -0.051, + "num_tokens": 98361501.0, + "reward": 3.1205215454101562, + "reward_std": 0.9047970175743103, + "rewards/reward_fn/mean": 3.1205215454101562, + "rewards/reward_fn/std": 0.9047970771789551, + "step": 2129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1518.0, + "completions/max_terminated_length": 1518.0, + "completions/mean_length": 397.71875, + "completions/mean_terminated_length": 397.71875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.225946748700541, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.019150591921061277, + "learning_rate": 7.148399999999999e-06, + "loss": -0.0415, + "num_tokens": 98412372.0, + "reward": 2.7703003883361816, + "reward_std": 0.2596660852432251, + "rewards/reward_fn/mean": 2.7703003883361816, + "rewards/reward_fn/std": 0.2596660554409027, + "step": 2130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1542.0, + "completions/max_terminated_length": 1542.0, + "completions/mean_length": 377.90625, + "completions/mean_terminated_length": 377.90625, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.2260528269863159, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.022556321462616324, + "learning_rate": 7.147999999999999e-06, + "loss": 0.0785, + "num_tokens": 98456241.0, + "reward": 3.0584633350372314, + "reward_std": 0.5075932145118713, + "rewards/reward_fn/mean": 3.0584633350372314, + "rewards/reward_fn/std": 0.5075931549072266, + "step": 2131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 722.0, + "completions/max_terminated_length": 722.0, + "completions/mean_length": 190.125, + "completions/mean_terminated_length": 190.125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.22615890527209082, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.390625, + "kl": 0.01705359760671854, + "learning_rate": 7.147599999999999e-06, + "loss": 0.1622, + "num_tokens": 98508533.0, + "reward": 3.8904531002044678, + "reward_std": 0.34638121724128723, + "rewards/reward_fn/mean": 3.8904531002044678, + "rewards/reward_fn/std": 0.34638121724128723, + "step": 2132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 332.03125, + "completions/mean_terminated_length": 332.03125, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.2262649835578657, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.023051244905218482, + "learning_rate": 7.147199999999999e-06, + "loss": 0.0897, + "num_tokens": 98553238.0, + "reward": 3.1788926124572754, + "reward_std": 0.6480764746665955, + "rewards/reward_fn/mean": 3.1788926124572754, + "rewards/reward_fn/std": 0.6480764746665955, + "step": 2133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.0, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 230.8125, + "completions/mean_terminated_length": 230.8125, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.22637106184364061, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.01907302311155945, + "learning_rate": 7.1468e-06, + "loss": -0.031, + "num_tokens": 98602416.0, + "reward": 3.8189072608947754, + "reward_std": 0.5166525840759277, + "rewards/reward_fn/mean": 3.8189072608947754, + "rewards/reward_fn/std": 0.516652524471283, + "step": 2134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2044.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 471.53125, + "completions/mean_terminated_length": 471.53125, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.2264771401294155, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.016809441964142025, + "learning_rate": 7.1464e-06, + "loss": -0.1404, + "num_tokens": 98653633.0, + "reward": 3.647747039794922, + "reward_std": 0.5732264518737793, + "rewards/reward_fn/mean": 3.647747039794922, + "rewards/reward_fn/std": 0.5732264518737793, + "step": 2135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1298.0, + "completions/max_terminated_length": 1298.0, + "completions/mean_length": 260.28125, + "completions/mean_terminated_length": 260.28125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.2265832184151904, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.026241691317409277, + "learning_rate": 7.146e-06, + "loss": 0.0298, + "num_tokens": 98678858.0, + "reward": 3.641641616821289, + "reward_std": 0.47242069244384766, + "rewards/reward_fn/mean": 3.641641616821289, + "rewards/reward_fn/std": 0.47242069244384766, + "step": 2136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1598.0, + "completions/mean_length": 695.40625, + "completions/mean_terminated_length": 605.2333374023438, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + "epoch": 0.22668929670096533, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.018082533963024616, + "learning_rate": 7.1456e-06, + "loss": 0.1919, + "num_tokens": 98741495.0, + "reward": 2.9328463077545166, + "reward_std": 0.5077850222587585, + "rewards/reward_fn/mean": 2.9328463077545166, + "rewards/reward_fn/std": 0.5077849626541138, + "step": 2137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1250.0, + "completions/max_terminated_length": 1250.0, + "completions/mean_length": 345.21875, + "completions/mean_terminated_length": 345.21875, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.2267953749867402, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.020768756279721856, + "learning_rate": 7.1452e-06, + "loss": 0.1444, + "num_tokens": 98782814.0, + "reward": 2.7692275047302246, + "reward_std": 0.04734927415847778, + "rewards/reward_fn/mean": 2.7692275047302246, + "rewards/reward_fn/std": 0.047349270433187485, + "step": 2138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 807.0, + "completions/max_terminated_length": 807.0, + "completions/mean_length": 259.0, + "completions/mean_terminated_length": 259.0, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.22690145327251512, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.025864448165521026, + "learning_rate": 7.1448e-06, + "loss": 0.1333, + "num_tokens": 98819966.0, + "reward": 2.8633437156677246, + "reward_std": 0.04574419930577278, + "rewards/reward_fn/mean": 2.8633437156677246, + "rewards/reward_fn/std": 0.04574418067932129, + "step": 2139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 105.78125, + "completions/mean_terminated_length": 105.78125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.22700753155829, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09912109375, + "kl": 0.018404418835416436, + "learning_rate": 7.1444e-06, + "loss": 0.0007, + "num_tokens": 98846903.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 285.21875, + "completions/mean_terminated_length": 285.21875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.22711360984406492, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.017287316382862628, + "learning_rate": 7.144e-06, + "loss": -0.0023, + "num_tokens": 98902142.0, + "reward": 2.7961578369140625, + "reward_std": 0.4452827572822571, + "rewards/reward_fn/mean": 2.7961578369140625, + "rewards/reward_fn/std": 0.4452826976776123, + "step": 2141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 953.0, + "completions/max_terminated_length": 953.0, + "completions/mean_length": 233.21875, + "completions/mean_terminated_length": 233.21875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.2272196881298398, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.515625, + "kl": 0.025570286670699716, + "learning_rate": 7.1436e-06, + "loss": 0.2669, + "num_tokens": 98946053.0, + "reward": 3.322685956954956, + "reward_std": 0.10418742150068283, + "rewards/reward_fn/mean": 3.322685956954956, + "rewards/reward_fn/std": 0.10418742895126343, + "step": 2142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1006.0, + "completions/max_terminated_length": 1006.0, + "completions/mean_length": 340.65625, + "completions/mean_terminated_length": 340.65625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.22732576641561472, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06591796875, + "kl": 0.01869728649035096, + "learning_rate": 7.1432e-06, + "loss": 0.0007, + "num_tokens": 98999642.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1734.0, + "completions/max_terminated_length": 1734.0, + "completions/mean_length": 447.5, + "completions/mean_terminated_length": 447.5, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.22743184470138963, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.020166749833151698, + "learning_rate": 7.1428e-06, + "loss": 0.0329, + "num_tokens": 99053162.0, + "reward": 2.868180274963379, + "reward_std": 0.04937407374382019, + "rewards/reward_fn/mean": 2.868180274963379, + "rewards/reward_fn/std": 0.049374066293239594, + "step": 2144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 739.0, + "completions/max_terminated_length": 739.0, + "completions/mean_length": 207.625, + "completions/mean_terminated_length": 207.625, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.22753792298716452, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.016309529077261686, + "learning_rate": 7.1424e-06, + "loss": 0.0007, + "num_tokens": 99096958.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 945.0, + "completions/max_terminated_length": 945.0, + "completions/mean_length": 257.5625, + "completions/mean_terminated_length": 257.5625, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.22764400127293943, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.02131958887912333, + "learning_rate": 7.142e-06, + "loss": -0.0234, + "num_tokens": 99143504.0, + "reward": 3.061922788619995, + "reward_std": 0.41329920291900635, + "rewards/reward_fn/mean": 3.061922788619995, + "rewards/reward_fn/std": 0.41329917311668396, + "step": 2146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 800.0, + "completions/max_terminated_length": 800.0, + "completions/mean_length": 179.21875, + "completions/mean_terminated_length": 179.21875, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.22775007955871432, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.421875, + "kl": 0.03266157838515937, + "learning_rate": 7.1416e-06, + "loss": 0.1967, + "num_tokens": 99180919.0, + "reward": 2.9364829063415527, + "reward_std": 0.05752362683415413, + "rewards/reward_fn/mean": 2.9364829063415527, + "rewards/reward_fn/std": 0.05752362310886383, + "step": 2147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 163.40625, + "completions/mean_terminated_length": 163.40625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.22785615784448923, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.734375, + "kl": 0.02107630728278309, + "learning_rate": 7.1412e-06, + "loss": 0.0386, + "num_tokens": 99222948.0, + "reward": 3.933260440826416, + "reward_std": 0.26269760727882385, + "rewards/reward_fn/mean": 3.933260440826416, + "rewards/reward_fn/std": 0.26269757747650146, + "step": 2148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1169.0, + "completions/max_terminated_length": 1169.0, + "completions/mean_length": 490.1875, + "completions/mean_terminated_length": 490.1875, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.22796223613026415, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.020313676446676254, + "learning_rate": 7.1407999999999995e-06, + "loss": 0.0864, + "num_tokens": 99298218.0, + "reward": 2.649317979812622, + "reward_std": 0.5141263604164124, + "rewards/reward_fn/mean": 2.649317979812622, + "rewards/reward_fn/std": 0.5141263604164124, + "step": 2149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1038.0, + "completions/max_terminated_length": 1038.0, + "completions/mean_length": 312.34375, + "completions/mean_terminated_length": 312.34375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.22806831441603903, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.02623810595832765, + "learning_rate": 7.1403999999999994e-06, + "loss": -0.079, + "num_tokens": 99346709.0, + "reward": 3.2548115253448486, + "reward_std": 0.5868596434593201, + "rewards/reward_fn/mean": 3.2548115253448486, + "rewards/reward_fn/std": 0.5868596434593201, + "step": 2150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1001.0, + "completions/max_terminated_length": 1001.0, + "completions/mean_length": 285.4375, + "completions/mean_terminated_length": 285.4375, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.22817439270181394, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.027033630991354585, + "learning_rate": 7.139999999999999e-06, + "loss": 0.1689, + "num_tokens": 99391395.0, + "reward": 3.732773780822754, + "reward_std": 0.5501604676246643, + "rewards/reward_fn/mean": 3.732773780822754, + "rewards/reward_fn/std": 0.5501604676246643, + "step": 2151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 883.0, + "completions/max_terminated_length": 883.0, + "completions/mean_length": 210.59375, + "completions/mean_terminated_length": 210.59375, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.22828047098758883, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.01814991922583431, + "learning_rate": 7.139599999999999e-06, + "loss": -0.0006, + "num_tokens": 99410710.0, + "reward": 2.9454665184020996, + "reward_std": 0.04778565838932991, + "rewards/reward_fn/mean": 2.9454665184020996, + "rewards/reward_fn/std": 0.04778566583991051, + "step": 2152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 987.0, + "completions/max_terminated_length": 987.0, + "completions/mean_length": 308.875, + "completions/mean_terminated_length": 308.875, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.22838654927336374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0947265625, + "kl": 0.024135290645062923, + "learning_rate": 7.139199999999999e-06, + "loss": 0.001, + "num_tokens": 99458034.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1181.0, + "completions/max_terminated_length": 1181.0, + "completions/mean_length": 292.5625, + "completions/mean_terminated_length": 292.5625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.22849262755913866, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.03052612068131566, + "learning_rate": 7.138799999999999e-06, + "loss": -0.0475, + "num_tokens": 99498948.0, + "reward": 3.8826375007629395, + "reward_std": 0.3714655637741089, + "rewards/reward_fn/mean": 3.8826375007629395, + "rewards/reward_fn/std": 0.3714655637741089, + "step": 2154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1912.0, + "completions/max_terminated_length": 1912.0, + "completions/mean_length": 348.375, + "completions/mean_terminated_length": 348.375, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.22859870584491354, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9375, + "kl": 0.02576355915516615, + "learning_rate": 7.138399999999999e-06, + "loss": -0.0272, + "num_tokens": 99553680.0, + "reward": 3.7404799461364746, + "reward_std": 0.5951921939849854, + "rewards/reward_fn/mean": 3.7404799461364746, + "rewards/reward_fn/std": 0.5951921343803406, + "step": 2155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 833.0, + "completions/max_terminated_length": 833.0, + "completions/mean_length": 297.3125, + "completions/mean_terminated_length": 297.3125, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.22870478413068845, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.02024422027170658, + "learning_rate": 7.137999999999999e-06, + "loss": 0.0165, + "num_tokens": 99598522.0, + "reward": 2.861987352371216, + "reward_std": 0.38427045941352844, + "rewards/reward_fn/mean": 2.861987352371216, + "rewards/reward_fn/std": 0.38427045941352844, + "step": 2156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 166.09375, + "completions/mean_terminated_length": 166.09375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.22881086241646334, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.421875, + "kl": 0.02221654006280005, + "learning_rate": 7.137599999999999e-06, + "loss": -0.0167, + "num_tokens": 99634173.0, + "reward": 3.8790369033813477, + "reward_std": 0.28946876525878906, + "rewards/reward_fn/mean": 3.8790369033813477, + "rewards/reward_fn/std": 0.28946876525878906, + "step": 2157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1931.0, + "completions/max_terminated_length": 1931.0, + "completions/mean_length": 596.8125, + "completions/mean_terminated_length": 596.8125, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "epoch": 0.22891694070223825, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.022596941329538822, + "learning_rate": 7.1372e-06, + "loss": 0.0232, + "num_tokens": 99696119.0, + "reward": 2.7338013648986816, + "reward_std": 0.1886308789253235, + "rewards/reward_fn/mean": 2.7338013648986816, + "rewards/reward_fn/std": 0.1886308640241623, + "step": 2158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1288.0, + "completions/max_terminated_length": 1288.0, + "completions/mean_length": 437.3125, + "completions/mean_terminated_length": 437.3125, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.22902301898801317, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.028475699247792363, + "learning_rate": 7.1368e-06, + "loss": -0.198, + "num_tokens": 99739841.0, + "reward": 3.0553183555603027, + "reward_std": 0.39695262908935547, + "rewards/reward_fn/mean": 3.0553183555603027, + "rewards/reward_fn/std": 0.3969525992870331, + "step": 2159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 102.15625, + "completions/mean_terminated_length": 102.15625, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.22912909727378805, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08642578125, + "kl": 0.013394350535236299, + "learning_rate": 7.1364e-06, + "loss": 0.0005, + "num_tokens": 99773542.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 942.0, + "completions/max_terminated_length": 942.0, + "completions/mean_length": 370.6875, + "completions/mean_terminated_length": 370.6875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.22923517555956296, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.01727902633138001, + "learning_rate": 7.136e-06, + "loss": 0.1644, + "num_tokens": 99804604.0, + "reward": 2.8176543712615967, + "reward_std": 0.04095921292901039, + "rewards/reward_fn/mean": 2.8176543712615967, + "rewards/reward_fn/std": 0.04095920920372009, + "step": 2161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 149.78125, + "completions/mean_terminated_length": 149.78125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.22934125384533785, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.091796875, + "kl": 0.021210170816630125, + "learning_rate": 7.1356e-06, + "loss": 0.0008, + "num_tokens": 99853013.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1352.0, + "completions/max_terminated_length": 1352.0, + "completions/mean_length": 299.90625, + "completions/mean_terminated_length": 299.90625, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.22944733213111276, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.028875578893348575, + "learning_rate": 7.1352e-06, + "loss": 0.0038, + "num_tokens": 99914418.0, + "reward": 2.7917895317077637, + "reward_std": 0.2923561930656433, + "rewards/reward_fn/mean": 2.7917895317077637, + "rewards/reward_fn/std": 0.2923561930656433, + "step": 2163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 156.875, + "completions/mean_terminated_length": 156.875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.22955341041688768, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1083984375, + "kl": 0.02055112540256232, + "learning_rate": 7.1348e-06, + "loss": 0.0008, + "num_tokens": 99960398.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 991.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 149.875, + "completions/mean_terminated_length": 149.875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.22965948870266256, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.953125, + "kl": 0.02863681223243475, + "learning_rate": 7.1344e-06, + "loss": -0.0067, + "num_tokens": 99982122.0, + "reward": 3.969954013824463, + "reward_std": 0.1699649542570114, + "rewards/reward_fn/mean": 3.969954013824463, + "rewards/reward_fn/std": 0.1699649542570114, + "step": 2165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1111.0, + "completions/max_terminated_length": 1111.0, + "completions/mean_length": 504.25, + "completions/mean_terminated_length": 504.25, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "epoch": 0.22976556698843748, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.01761721633374691, + "learning_rate": 7.134e-06, + "loss": 0.0124, + "num_tokens": 100038514.0, + "reward": 3.1369893550872803, + "reward_std": 0.5092772841453552, + "rewards/reward_fn/mean": 3.1369893550872803, + "rewards/reward_fn/std": 0.5092772841453552, + "step": 2166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1461.0, + "completions/max_terminated_length": 1461.0, + "completions/mean_length": 400.40625, + "completions/mean_terminated_length": 400.40625, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.22987164527421236, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8671875, + "kl": 0.02728833886794746, + "learning_rate": 7.1336e-06, + "loss": 0.1206, + "num_tokens": 100080255.0, + "reward": 2.9163427352905273, + "reward_std": 0.20467211306095123, + "rewards/reward_fn/mean": 2.9163427352905273, + "rewards/reward_fn/std": 0.20467209815979004, + "step": 2167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 770.0, + "completions/max_terminated_length": 770.0, + "completions/mean_length": 252.90625, + "completions/mean_terminated_length": 252.90625, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.22997772355998727, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.026255525881424546, + "learning_rate": 7.1332e-06, + "loss": 0.1577, + "num_tokens": 100104924.0, + "reward": 3.8455944061279297, + "reward_std": 0.3646887540817261, + "rewards/reward_fn/mean": 3.8455944061279297, + "rewards/reward_fn/std": 0.3646887540817261, + "step": 2168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 268.25, + "completions/mean_terminated_length": 268.25, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.23008380184576216, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.020109534729272127, + "learning_rate": 7.132799999999999e-06, + "loss": 0.0003, + "num_tokens": 100135012.0, + "reward": 3.9615554809570312, + "reward_std": 0.2174752801656723, + "rewards/reward_fn/mean": 3.9615554809570312, + "rewards/reward_fn/std": 0.2174752801656723, + "step": 2169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1004.0, + "completions/max_terminated_length": 1004.0, + "completions/mean_length": 283.25, + "completions/mean_terminated_length": 283.25, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.23018988013153707, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.020035719266161323, + "learning_rate": 7.1324e-06, + "loss": 0.0479, + "num_tokens": 100157420.0, + "reward": 3.928600788116455, + "reward_std": 0.4038942754268646, + "rewards/reward_fn/mean": 3.928600788116455, + "rewards/reward_fn/std": 0.4038942754268646, + "step": 2170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1563.0, + "completions/max_terminated_length": 1563.0, + "completions/mean_length": 414.53125, + "completions/mean_terminated_length": 414.53125, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.23029595841731199, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.027303512208163738, + "learning_rate": 7.132e-06, + "loss": -0.0721, + "num_tokens": 100210237.0, + "reward": 3.4726829528808594, + "reward_std": 0.6791224479675293, + "rewards/reward_fn/mean": 3.4726829528808594, + "rewards/reward_fn/std": 0.6791225075721741, + "step": 2171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.0, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 204.09375, + "completions/mean_terminated_length": 204.09375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.23040203670308687, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.02836342342197895, + "learning_rate": 7.1316e-06, + "loss": -0.026, + "num_tokens": 100255296.0, + "reward": 3.934906005859375, + "reward_std": 0.25615838170051575, + "rewards/reward_fn/mean": 3.934906005859375, + "rewards/reward_fn/std": 0.25615841150283813, + "step": 2172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1028.0, + "completions/max_terminated_length": 1028.0, + "completions/mean_length": 283.34375, + "completions/mean_terminated_length": 283.34375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.23050811498886178, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.028128183213993907, + "learning_rate": 7.1312e-06, + "loss": 0.1033, + "num_tokens": 100299659.0, + "reward": 3.6705162525177, + "reward_std": 0.551688551902771, + "rewards/reward_fn/mean": 3.6705162525177, + "rewards/reward_fn/std": 0.5516886115074158, + "step": 2173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1181.0, + "completions/max_terminated_length": 1181.0, + "completions/mean_length": 352.8125, + "completions/mean_terminated_length": 352.8125, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.23061419327463667, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.01739080680999905, + "learning_rate": 7.1307999999999996e-06, + "loss": 0.1207, + "num_tokens": 100346725.0, + "reward": 2.7642221450805664, + "reward_std": 0.028661344200372696, + "rewards/reward_fn/mean": 2.7642221450805664, + "rewards/reward_fn/std": 0.028661338612437248, + "step": 2174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 126.09375, + "completions/mean_terminated_length": 126.09375, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.23072027156041158, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06591796875, + "kl": 0.011620030330959707, + "learning_rate": 7.1303999999999995e-06, + "loss": 0.0005, + "num_tokens": 100413096.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1259.0, + "completions/max_terminated_length": 1259.0, + "completions/mean_length": 360.71875, + "completions/mean_terminated_length": 360.71875, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.2308263498461865, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.024960935348644853, + "learning_rate": 7.1299999999999995e-06, + "loss": 0.0453, + "num_tokens": 100461247.0, + "reward": 3.6098623275756836, + "reward_std": 0.7507240176200867, + "rewards/reward_fn/mean": 3.6098623275756836, + "rewards/reward_fn/std": 0.7507238984107971, + "step": 2176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 847.0, + "completions/max_terminated_length": 847.0, + "completions/mean_length": 261.90625, + "completions/mean_terminated_length": 261.90625, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.23093242813196138, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.02353836433030665, + "learning_rate": 7.1295999999999995e-06, + "loss": 0.0189, + "num_tokens": 100510204.0, + "reward": 2.9872453212738037, + "reward_std": 0.39124536514282227, + "rewards/reward_fn/mean": 2.9872453212738037, + "rewards/reward_fn/std": 0.3912453353404999, + "step": 2177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 986.0, + "completions/max_terminated_length": 986.0, + "completions/mean_length": 328.5625, + "completions/mean_terminated_length": 328.5625, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.2310385064177363, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9375, + "kl": 0.026427945122122765, + "learning_rate": 7.1291999999999994e-06, + "loss": 0.0886, + "num_tokens": 100564046.0, + "reward": 3.81813645362854, + "reward_std": 0.5137379169464111, + "rewards/reward_fn/mean": 3.81813645362854, + "rewards/reward_fn/std": 0.5137379169464111, + "step": 2178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1475.0, + "completions/max_terminated_length": 1475.0, + "completions/mean_length": 349.96875, + "completions/mean_terminated_length": 349.96875, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.23114458470351118, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5625, + "kl": 0.02832777169533074, + "learning_rate": 7.128799999999999e-06, + "loss": -0.0493, + "num_tokens": 100610381.0, + "reward": 2.8107922077178955, + "reward_std": 0.21972812712192535, + "rewards/reward_fn/mean": 2.8107922077178955, + "rewards/reward_fn/std": 0.21972811222076416, + "step": 2179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 964.0, + "completions/max_terminated_length": 964.0, + "completions/mean_length": 290.75, + "completions/mean_terminated_length": 290.75, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.2312506629892861, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.078125, + "kl": 0.021952015929855406, + "learning_rate": 7.128399999999999e-06, + "loss": 0.0009, + "num_tokens": 100650533.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 966.0, + "completions/mean_length": 441.8125, + "completions/mean_terminated_length": 390.0, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.231356741275061, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.029917727690190077, + "learning_rate": 7.128e-06, + "loss": 0.2161, + "num_tokens": 100676063.0, + "reward": 2.702650547027588, + "reward_std": 0.7632204294204712, + "rewards/reward_fn/mean": 2.702650547027588, + "rewards/reward_fn/std": 0.7632204294204712, + "step": 2181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1366.0, + "completions/max_terminated_length": 1366.0, + "completions/mean_length": 250.1875, + "completions/mean_terminated_length": 250.1875, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.2314628195608359, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.030789189273491502, + "learning_rate": 7.1276e-06, + "loss": -0.0116, + "num_tokens": 100717029.0, + "reward": 3.1781158447265625, + "reward_std": 0.33881837129592896, + "rewards/reward_fn/mean": 3.1781158447265625, + "rewards/reward_fn/std": 0.33881837129592896, + "step": 2182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 750.0, + "completions/max_terminated_length": 750.0, + "completions/mean_length": 170.90625, + "completions/mean_terminated_length": 170.90625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.2315688978466108, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12060546875, + "kl": 0.02682259352877736, + "learning_rate": 7.1272e-06, + "loss": 0.0011, + "num_tokens": 100746018.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 917.0, + "completions/max_terminated_length": 917.0, + "completions/mean_length": 286.375, + "completions/mean_terminated_length": 286.375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.2316749761323857, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08544921875, + "kl": 0.022002801997587085, + "learning_rate": 7.1268e-06, + "loss": 0.0009, + "num_tokens": 100793678.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1891.0, + "completions/max_terminated_length": 1891.0, + "completions/mean_length": 431.5625, + "completions/mean_terminated_length": 431.5625, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.2317810544181606, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.02914230595342815, + "learning_rate": 7.1264e-06, + "loss": 0.0501, + "num_tokens": 100838336.0, + "reward": 3.2587783336639404, + "reward_std": 0.5467434525489807, + "rewards/reward_fn/mean": 3.2587783336639404, + "rewards/reward_fn/std": 0.5467433929443359, + "step": 2185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 978.0, + "completions/max_terminated_length": 978.0, + "completions/mean_length": 211.625, + "completions/mean_terminated_length": 211.625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.23188713270393552, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "kl": 0.033715502824634314, + "learning_rate": 7.126e-06, + "loss": 0.1925, + "num_tokens": 100882452.0, + "reward": 3.87423038482666, + "reward_std": 0.43363064527511597, + "rewards/reward_fn/mean": 3.87423038482666, + "rewards/reward_fn/std": 0.43363064527511597, + "step": 2186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1048.0, + "completions/max_terminated_length": 1048.0, + "completions/mean_length": 257.4375, + "completions/mean_terminated_length": 257.4375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.2319932109897104, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.020584039855748415, + "learning_rate": 7.1256e-06, + "loss": -0.0418, + "num_tokens": 100914338.0, + "reward": 3.1321568489074707, + "reward_std": 0.38132020831108093, + "rewards/reward_fn/mean": 3.1321568489074707, + "rewards/reward_fn/std": 0.3813202381134033, + "step": 2187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 105.65625, + "completions/mean_terminated_length": 105.65625, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.23209928927548532, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.021333031821995974, + "learning_rate": 7.1252e-06, + "loss": -0.0669, + "num_tokens": 100957143.0, + "reward": 3.848611831665039, + "reward_std": 0.7179643511772156, + "rewards/reward_fn/mean": 3.848611831665039, + "rewards/reward_fn/std": 0.7179643511772156, + "step": 2188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 154.25, + "completions/mean_terminated_length": 154.25, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.2322053675612602, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.02006891486234963, + "learning_rate": 7.124799999999999e-06, + "loss": 0.1186, + "num_tokens": 100992351.0, + "reward": 2.937816619873047, + "reward_std": 0.04478609934449196, + "rewards/reward_fn/mean": 2.937816619873047, + "rewards/reward_fn/std": 0.04478614032268524, + "step": 2189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1383.0, + "completions/max_terminated_length": 1383.0, + "completions/mean_length": 438.84375, + "completions/mean_terminated_length": 438.84375, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.23231144584703511, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07470703125, + "kl": 0.02082824590615928, + "learning_rate": 7.124399999999999e-06, + "loss": 0.0008, + "num_tokens": 101040506.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 870.0, + "completions/max_terminated_length": 870.0, + "completions/mean_length": 199.46875, + "completions/mean_terminated_length": 199.46875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.23241752413281003, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11767578125, + "kl": 0.028772379737347364, + "learning_rate": 7.123999999999999e-06, + "loss": 0.0012, + "num_tokens": 101063753.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 133.25, + "completions/mean_terminated_length": 133.25, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.2325236024185849, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.984375, + "kl": 0.022556824376806617, + "learning_rate": 7.123599999999999e-06, + "loss": 0.0406, + "num_tokens": 101085489.0, + "reward": 3.968625068664551, + "reward_std": 0.17748311161994934, + "rewards/reward_fn/mean": 3.968625068664551, + "rewards/reward_fn/std": 0.17748311161994934, + "step": 2192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 788.0, + "completions/max_terminated_length": 788.0, + "completions/mean_length": 243.8125, + "completions/mean_terminated_length": 243.8125, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.23262968070435983, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.027054782956838608, + "learning_rate": 7.1232e-06, + "loss": 0.0451, + "num_tokens": 101123755.0, + "reward": 2.9340295791625977, + "reward_std": 0.03773088380694389, + "rewards/reward_fn/mean": 2.9340295791625977, + "rewards/reward_fn/std": 0.03773083910346031, + "step": 2193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 676.0, + "completions/max_terminated_length": 676.0, + "completions/mean_length": 197.21875, + "completions/mean_terminated_length": 197.21875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.2327357589901347, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.109375, + "kl": 0.031393368961289525, + "learning_rate": 7.1228e-06, + "loss": 0.0013, + "num_tokens": 101164114.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 896.0, + "completions/max_terminated_length": 896.0, + "completions/mean_length": 533.09375, + "completions/mean_terminated_length": 533.09375, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.23284183727590962, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.022313715890049934, + "learning_rate": 7.1224e-06, + "loss": -0.0249, + "num_tokens": 101220981.0, + "reward": 3.169848918914795, + "reward_std": 0.4511135220527649, + "rewards/reward_fn/mean": 3.169848918914795, + "rewards/reward_fn/std": 0.4511135220527649, + "step": 2195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1520.0, + "completions/max_terminated_length": 1520.0, + "completions/mean_length": 453.90625, + "completions/mean_terminated_length": 453.90625, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.2329479155616845, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.02297700964845717, + "learning_rate": 7.122e-06, + "loss": -0.101, + "num_tokens": 101275314.0, + "reward": 2.5342376232147217, + "reward_std": 0.5728961229324341, + "rewards/reward_fn/mean": 2.5342376232147217, + "rewards/reward_fn/std": 0.5728961229324341, + "step": 2196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1759.0, + "completions/max_terminated_length": 1759.0, + "completions/mean_length": 361.46875, + "completions/mean_terminated_length": 361.46875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.23305399384745942, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.034721347503364086, + "learning_rate": 7.1216e-06, + "loss": -0.0032, + "num_tokens": 101320385.0, + "reward": 2.7689619064331055, + "reward_std": 0.25961360335350037, + "rewards/reward_fn/mean": 2.7689619064331055, + "rewards/reward_fn/std": 0.259613573551178, + "step": 2197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1031.0, + "completions/max_terminated_length": 1031.0, + "completions/mean_length": 448.03125, + "completions/mean_terminated_length": 448.03125, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.23316007213323434, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.034088447922840714, + "learning_rate": 7.1212e-06, + "loss": -0.0266, + "num_tokens": 101363330.0, + "reward": 3.3539974689483643, + "reward_std": 0.6181737780570984, + "rewards/reward_fn/mean": 3.3539974689483643, + "rewards/reward_fn/std": 0.6181737780570984, + "step": 2198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 921.0, + "completions/max_terminated_length": 921.0, + "completions/mean_length": 352.78125, + "completions/mean_terminated_length": 352.78125, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.23326615041900922, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.02180984802544117, + "learning_rate": 7.1208e-06, + "loss": 0.0872, + "num_tokens": 101414651.0, + "reward": 2.8594231605529785, + "reward_std": 0.05167490243911743, + "rewards/reward_fn/mean": 2.8594231605529785, + "rewards/reward_fn/std": 0.05167488753795624, + "step": 2199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1237.0, + "completions/max_terminated_length": 1237.0, + "completions/mean_length": 328.75, + "completions/mean_terminated_length": 328.75, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.23337222870478413, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.026791174663230777, + "learning_rate": 7.1204e-06, + "loss": 0.1349, + "num_tokens": 101459891.0, + "reward": 3.7417798042297363, + "reward_std": 0.6406286358833313, + "rewards/reward_fn/mean": 3.7417798042297363, + "rewards/reward_fn/std": 0.6406285762786865, + "step": 2200 + } + ], + "logging_steps": 1, + "max_steps": 20000, + "num_input_tokens_seen": 101459891, + "num_train_epochs": 3, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}