{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.48, "eval_steps": 500, "global_step": 6000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.921875, "completions/max_length": 256.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 247.265625, "completions/mean_terminated_length": 144.1999969482422, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.20497874170541763, "epoch": 8e-05, "frac_reward_zero_std": 0.375, "grad_norm": 0.3721550405025482, "learning_rate": 0.0, "loss": 0.0041, "num_tokens": 97186.0, "reward": -1.59375, "reward_std": 0.6905868649482727, "rewards/reward_fn/mean": -1.59375, "rewards/reward_fn/std": 2.094001531600952, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.228990375995636, "epoch": 0.00016, "grad_norm": 0.23064066469669342, "learning_rate": 5.6000000000000005e-09, "loss": 0.0048, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "entropy": 0.13428866863250732, "epoch": 0.00024, "frac_reward_zero_std": 0.75, "grad_norm": 0.19215697050094604, "learning_rate": 1.1200000000000001e-08, "loss": -0.0134, "num_tokens": 195490.0, "reward": -2.53125, "reward_std": 0.24491733312606812, "rewards/reward_fn/mean": -2.53125, "rewards/reward_fn/std": 1.0935566425323486, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.13455941528081894, "epoch": 0.00032, "grad_norm": 0.1212029755115509, "learning_rate": 1.6799999999999998e-08, "loss": 0.0135, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.953125, "completions/max_length": 256.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 253.3984375, "completions/mean_terminated_length": 200.5, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.21762345731258392, "epoch": 0.0004, "frac_reward_zero_std": 0.625, "grad_norm": 0.3124823570251465, "learning_rate": 2.2400000000000002e-08, "loss": 0.0602, "num_tokens": 293461.0, "reward": -2.109375, "reward_std": 0.4436737596988678, "rewards/reward_fn/mean": -2.109375, "rewards/reward_fn/std": 1.3760285377502441, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0001220703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001220703125, "entropy": 0.23354104906320572, "epoch": 0.00048, "grad_norm": 0.3573931157588959, "learning_rate": 2.8e-08, "loss": -0.0602, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 256.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 254.234375, "completions/mean_terminated_length": 143.0, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.2360340803861618, "epoch": 0.00056, "frac_reward_zero_std": 0.25, "grad_norm": 0.5249609351158142, "learning_rate": 3.3599999999999996e-08, "loss": -0.0532, "num_tokens": 391539.0, "reward": -2.015625, "reward_std": 0.9420637488365173, "rewards/reward_fn/mean": -2.015625, "rewards/reward_fn/std": 1.4141265153884888, "step": 7 }, { "clip_ratio/high_max": 0.00048828125, "clip_ratio/high_mean": 0.00018310546875, "clip_ratio/low_mean": 0.0003662109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00054931640625, "entropy": 0.2211228609085083, "epoch": 0.00064, "grad_norm": 0.43063002824783325, "learning_rate": 3.92e-08, "loss": 0.0584, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.890625, "completions/max_length": 256.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 240.125, "completions/mean_terminated_length": 110.85714721679688, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.2717144191265106, "epoch": 0.00072, "frac_reward_zero_std": 0.5, "grad_norm": 0.49900996685028076, "learning_rate": 4.4800000000000004e-08, "loss": -0.0535, "num_tokens": 487811.0, "reward": -1.3315614461898804, "reward_std": 0.7751052379608154, "rewards/reward_fn/mean": -1.3315614461898804, "rewards/reward_fn/std": 1.7432754039764404, "step": 9 }, { "clip_ratio/high_max": 0.0008599843131378293, "clip_ratio/high_mean": 0.00021499607828445733, "clip_ratio/low_mean": 0.00014637001731898636, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003613660956034437, "entropy": 0.29148124158382416, "epoch": 0.0008, "grad_norm": 0.5239572525024414, "learning_rate": 5.0399999999999995e-08, "loss": 0.081, "step": 10 }, { "clip_ratio/high_max": 0.0002547121839597821, "clip_ratio/high_mean": 6.367804598994553e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 6.367804598994553e-05, "completions/clipped_ratio": 0.9140625, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 245.9609375, "completions/mean_terminated_length": 139.18182373046875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.26495183259248734, "epoch": 0.00088, "frac_reward_zero_std": 0.375, "grad_norm": 0.5417622923851013, "learning_rate": 5.6e-08, "loss": 0.0809, "num_tokens": 584830.0, "reward": -1.3359375, "reward_std": 0.9135527610778809, "rewards/reward_fn/mean": -1.3359375, "rewards/reward_fn/std": 1.836966633796692, "step": 11 }, { "clip_ratio/high_max": 0.0007479189662262797, "clip_ratio/high_mean": 0.0002548039483372122, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002548039483372122, "entropy": 0.22449453175067902, "epoch": 0.00096, "grad_norm": 0.541961133480072, "learning_rate": 6.160000000000001e-08, "loss": -0.0456, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.953125, "completions/max_length": 256.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 247.2265625, "completions/mean_terminated_length": 68.83333587646484, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.20574577152729034, "epoch": 0.00104, "frac_reward_zero_std": 0.375, "grad_norm": 0.3564685583114624, "learning_rate": 6.719999999999999e-08, "loss": 0.0358, "num_tokens": 682011.0, "reward": -2.271895408630371, "reward_std": 0.8866094350814819, "rewards/reward_fn/mean": -2.271895408630371, "rewards/reward_fn/std": 1.7975612878799438, "step": 13 }, { "clip_ratio/high_max": 0.000244140625, "clip_ratio/high_mean": 6.103515625e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 6.103515625e-05, "entropy": 0.25587353855371475, "epoch": 0.00112, "grad_norm": 0.5723403692245483, "learning_rate": 7.279999999999999e-08, "loss": -0.0126, "step": 14 }, { "clip_ratio/high_max": 0.0006807352183386683, "clip_ratio/high_mean": 0.0001701838045846671, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001701838045846671, "completions/clipped_ratio": 0.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 231.4921875, "completions/mean_terminated_length": 99.1500015258789, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.2521986663341522, "epoch": 0.0012, "frac_reward_zero_std": 0.25, "grad_norm": 0.39643800258636475, "learning_rate": 7.84e-08, "loss": 0.0418, "num_tokens": 777178.0, "reward": -0.9778289794921875, "reward_std": 0.9762378931045532, "rewards/reward_fn/mean": -0.9778289794921875, "rewards/reward_fn/std": 1.6498258113861084, "step": 15 }, { "clip_ratio/high_max": 0.00033355571213178337, "clip_ratio/high_mean": 0.00020545924780890346, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020545924780890346, "entropy": 0.19385410845279694, "epoch": 0.00128, "grad_norm": 0.6541548371315002, "learning_rate": 8.4e-08, "loss": -0.0289, "step": 16 }, { "clip_ratio/high_max": 0.0009765625, "clip_ratio/high_mean": 0.0003228064451832324, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003228064451832324, "completions/clipped_ratio": 0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 218.0390625, "completions/mean_terminated_length": 104.15625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.3826516717672348, "epoch": 0.00136, "frac_reward_zero_std": 0.125, "grad_norm": 1.2513238191604614, "learning_rate": 8.960000000000001e-08, "loss": 0.0045, "num_tokens": 870623.0, "reward": -0.5538802742958069, "reward_std": 1.4822357892990112, "rewards/reward_fn/mean": -0.5538802742958069, "rewards/reward_fn/std": 2.3942534923553467, "step": 17 }, { "clip_ratio/high_max": 0.001361164846457541, "clip_ratio/high_mean": 0.0007437442109221593, "clip_ratio/low_mean": 7.86163509474136e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008223605836974457, "entropy": 0.41362497210502625, "epoch": 0.00144, "grad_norm": 0.6100688576698303, "learning_rate": 9.519999999999999e-08, "loss": 0.0463, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 243.03125, "completions/mean_terminated_length": 117.66667175292969, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 0.3810832053422928, "epoch": 0.00152, "frac_reward_zero_std": 0.25, "grad_norm": 0.4688854515552521, "learning_rate": 1.0079999999999999e-07, "loss": -0.0514, "num_tokens": 967267.0, "reward": -1.0359331369400024, "reward_std": 1.0184823274612427, "rewards/reward_fn/mean": -1.0359331369400024, "rewards/reward_fn/std": 1.5325384140014648, "step": 19 }, { "clip_ratio/high_max": 0.0012834992667194456, "clip_ratio/high_mean": 0.00045420610695146024, "clip_ratio/low_mean": 6.103515625e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005152412632014602, "entropy": 0.30983109772205353, "epoch": 0.0016, "grad_norm": 0.7051077485084534, "learning_rate": 1.064e-07, "loss": 0.063, "step": 20 }, { "clip_ratio/high_max": 0.0004984641273040324, "clip_ratio/high_mean": 0.0001246160318260081, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001246160318260081, "completions/clipped_ratio": 0.921875, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 248.2421875, "completions/mean_terminated_length": 156.6999969482422, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.231474369764328, "epoch": 0.00168, "frac_reward_zero_std": 0.375, "grad_norm": 0.45522353053092957, "learning_rate": 1.12e-07, "loss": -0.058, "num_tokens": 1064578.0, "reward": -1.227344036102295, "reward_std": 0.9999254941940308, "rewards/reward_fn/mean": -1.227344036102295, "rewards/reward_fn/std": 2.2445688247680664, "step": 21 }, { "clip_ratio/high_max": 0.0010598056833259761, "clip_ratio/high_mean": 0.00032598657708149403, "clip_ratio/low_mean": 0.000184789823833853, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000510776400915347, "entropy": 0.2785019278526306, "epoch": 0.00176, "grad_norm": 0.45728006958961487, "learning_rate": 1.176e-07, "loss": 0.0672, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8984375, "completions/max_length": 256.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 244.765625, "completions/mean_terminated_length": 145.38462829589844, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3026137799024582, "epoch": 0.00184, "frac_reward_zero_std": 0.375, "grad_norm": 0.5769977569580078, "learning_rate": 1.2320000000000002e-07, "loss": 0.0203, "num_tokens": 1161444.0, "reward": -1.8150626420974731, "reward_std": 0.7287752032279968, "rewards/reward_fn/mean": -1.8150627613067627, "rewards/reward_fn/std": 1.7080397605895996, "step": 23 }, { "clip_ratio/high_max": 0.0011276674340479076, "clip_ratio/high_mean": 0.0004124073020648211, "clip_ratio/low_mean": 7.992327300598845e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000492330567794852, "entropy": 0.28815290331840515, "epoch": 0.00192, "grad_norm": 0.8114111423492432, "learning_rate": 1.288e-07, "loss": 0.0095, "step": 24 }, { "clip_ratio/high_max": 0.00048828125, "clip_ratio/high_mean": 0.0001220703125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001220703125, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 252.0546875, "completions/mean_terminated_length": 129.75, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.22846365720033646, "epoch": 0.002, "frac_reward_zero_std": 0.375, "grad_norm": 0.29812097549438477, "learning_rate": 1.3439999999999999e-07, "loss": 0.0017, "num_tokens": 1259243.0, "reward": -2.1997649669647217, "reward_std": 0.8282058238983154, "rewards/reward_fn/mean": -2.1997649669647217, "rewards/reward_fn/std": 1.7496846914291382, "step": 25 }, { "clip_ratio/high_max": 0.0002608241920825094, "clip_ratio/high_mean": 6.520604802062735e-05, "clip_ratio/low_mean": 0.0003115178842563182, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00037672392500098795, "entropy": 0.2196362167596817, "epoch": 0.00208, "grad_norm": 0.4030337631702423, "learning_rate": 1.4e-07, "loss": -0.0041, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 252.2578125, "completions/mean_terminated_length": 160.1999969482422, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.24670665711164474, "epoch": 0.00216, "frac_reward_zero_std": 0.5, "grad_norm": 0.40550342202186584, "learning_rate": 1.4559999999999998e-07, "loss": -0.0441, "num_tokens": 1357068.0, "reward": -1.171875, "reward_std": 0.8609572649002075, "rewards/reward_fn/mean": -1.171875, "rewards/reward_fn/std": 1.693476915359497, "step": 27 }, { "clip_ratio/high_max": 0.000244140625, "clip_ratio/high_mean": 0.0001220703125, "clip_ratio/low_mean": 0.00013539565406972542, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002574659665697254, "entropy": 0.2563888356089592, "epoch": 0.00224, "grad_norm": 580790.4375, "learning_rate": 1.512e-07, "loss": 19.2014, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 247.90625, "completions/mean_terminated_length": 169.6666717529297, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.23536859452724457, "epoch": 0.00232, "frac_reward_zero_std": 0.25, "grad_norm": 0.4679156541824341, "learning_rate": 1.568e-07, "loss": -0.0157, "num_tokens": 1454336.0, "reward": -0.9375, "reward_std": 1.2656139135360718, "rewards/reward_fn/mean": -0.9375, "rewards/reward_fn/std": 2.154468059539795, "step": 29 }, { "clip_ratio/high_max": 0.0005064589204266667, "clip_ratio/high_mean": 0.0003111537953373045, "clip_ratio/low_mean": 0.0001220703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004332241078373045, "entropy": 0.2380518838763237, "epoch": 0.0024, "grad_norm": 0.45563456416130066, "learning_rate": 1.624e-07, "loss": 0.0292, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 236.875, "completions/mean_terminated_length": 133.60000610351562, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.37828435003757477, "epoch": 0.00248, "frac_reward_zero_std": 0.375, "grad_norm": 0.5373854041099548, "learning_rate": 1.68e-07, "loss": 0.0414, "num_tokens": 1550192.0, "reward": -1.5234375, "reward_std": 1.0795061588287354, "rewards/reward_fn/mean": -1.5234375, "rewards/reward_fn/std": 1.8822203874588013, "step": 31 }, { "clip_ratio/high_max": 0.001596943533513695, "clip_ratio/high_mean": 0.0006860897119622678, "clip_ratio/low_mean": 0.0002833757462212816, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009694654145278037, "entropy": 0.3643944263458252, "epoch": 0.00256, "grad_norm": 0.5148366689682007, "learning_rate": 1.736e-07, "loss": 0.0202, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9296875, "completions/max_length": 256.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 249.921875, "completions/mean_terminated_length": 169.55555725097656, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.20062272995710373, "epoch": 0.00264, "frac_reward_zero_std": 0.375, "grad_norm": 0.38648128509521484, "learning_rate": 1.7920000000000002e-07, "loss": 0.0597, "num_tokens": 1647718.0, "reward": -1.1953125, "reward_std": 0.8629223108291626, "rewards/reward_fn/mean": -1.1953125, "rewards/reward_fn/std": 1.7391217947006226, "step": 33 }, { "clip_ratio/high_max": 0.00027457441319711506, "clip_ratio/high_mean": 0.00012967875227332115, "clip_ratio/low_mean": 0.00019832236284855753, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00032800110056996346, "entropy": 0.19026366621255875, "epoch": 0.00272, "grad_norm": 0.5285874009132385, "learning_rate": 1.848e-07, "loss": -0.0435, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 256.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 245.375, "completions/mean_terminated_length": 86.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.2384214922785759, "epoch": 0.0028, "frac_reward_zero_std": 0.5, "grad_norm": 0.4211091697216034, "learning_rate": 1.9039999999999998e-07, "loss": -0.0043, "num_tokens": 1744662.0, "reward": -1.546875, "reward_std": 0.5717606544494629, "rewards/reward_fn/mean": -1.546875, "rewards/reward_fn/std": 2.061493158340454, "step": 35 }, { "clip_ratio/high_max": 0.0007519943173974752, "clip_ratio/high_mean": 0.0002490337355993688, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002490337355993688, "entropy": 0.23527035117149353, "epoch": 0.00288, "grad_norm": 0.36768314242362976, "learning_rate": 1.96e-07, "loss": 0.0202, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "entropy": 0.11333256587386131, "epoch": 0.00296, "frac_reward_zero_std": 0.5, "grad_norm": 0.2922755479812622, "learning_rate": 2.0159999999999998e-07, "loss": -0.0238, "num_tokens": 1842966.0, "reward": -1.5234375, "reward_std": 0.6406084895133972, "rewards/reward_fn/mean": -1.5234375, "rewards/reward_fn/std": 1.5057100057601929, "step": 37 }, { "clip_ratio/high_max": 0.000244140625, "clip_ratio/high_mean": 6.103515625e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 6.103515625e-05, "entropy": 0.12424052879214287, "epoch": 0.00304, "grad_norm": 0.2968105673789978, "learning_rate": 2.072e-07, "loss": 0.0239, "step": 38 }, { "clip_ratio/high_max": 0.000244140625, "clip_ratio/high_mean": 6.103515625e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 6.103515625e-05, "completions/clipped_ratio": 0.953125, "completions/max_length": 256.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 251.109375, "completions/mean_terminated_length": 151.6666717529297, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.21012968569993973, "epoch": 0.00312, "frac_reward_zero_std": 0.125, "grad_norm": 0.4987395107746124, "learning_rate": 2.128e-07, "loss": 0.0686, "num_tokens": 1940644.0, "reward": -2.0625, "reward_std": 1.0469868183135986, "rewards/reward_fn/mean": -2.0625, "rewards/reward_fn/std": 1.3960011005401611, "step": 39 }, { "clip_ratio/high_max": 0.0002820079098455608, "clip_ratio/high_mean": 0.0001925722899613902, "clip_ratio/low_mean": 0.0001220703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003146426170133054, "entropy": 0.22067424654960632, "epoch": 0.0032, "grad_norm": 0.4304920732975006, "learning_rate": 2.184e-07, "loss": -0.0594, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8359375, "completions/max_length": 256.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 234.4296875, "completions/mean_terminated_length": 124.52381134033203, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.3134491890668869, "epoch": 0.00328, "frac_reward_zero_std": 0.5, "grad_norm": 0.526411235332489, "learning_rate": 2.24e-07, "loss": 0.0079, "num_tokens": 2036187.0, "reward": -0.375, "reward_std": 0.9235644340515137, "rewards/reward_fn/mean": -0.375, "rewards/reward_fn/std": 1.8823673725128174, "step": 41 }, { "clip_ratio/high_max": 0.0005305094819050282, "clip_ratio/high_mean": 0.00019366252672625706, "clip_ratio/low_mean": 6.513809057651088e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00025880062457872555, "entropy": 0.334441214799881, "epoch": 0.00336, "grad_norm": 0.5599517226219177, "learning_rate": 2.2960000000000002e-07, "loss": 0.0007, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.953125, "completions/max_length": 256.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 249.5234375, "completions/mean_terminated_length": 117.83333587646484, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.21662289649248123, "epoch": 0.00344, "frac_reward_zero_std": 0.5, "grad_norm": 0.35999858379364014, "learning_rate": 2.352e-07, "loss": 0.0849, "num_tokens": 2133662.0, "reward": -1.7109375, "reward_std": 0.842147707939148, "rewards/reward_fn/mean": -1.7109375, "rewards/reward_fn/std": 1.6702696084976196, "step": 43 }, { "clip_ratio/high_max": 0.000732421875, "clip_ratio/high_mean": 0.00030517578125, "clip_ratio/low_mean": 6.103515625e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003662109375, "entropy": 0.17115550488233566, "epoch": 0.00352, "grad_norm": 0.40192079544067383, "learning_rate": 2.408e-07, "loss": -0.0828, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00012953368423040956, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00012953368423040956, "completions/clipped_ratio": 0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 243.0234375, "completions/mean_terminated_length": 117.58333587646484, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.29287000000476837, "epoch": 0.0036, "frac_reward_zero_std": 0.375, "grad_norm": 0.5879268646240234, "learning_rate": 2.4640000000000004e-07, "loss": -0.0233, "num_tokens": 2230305.0, "reward": -1.640625, "reward_std": 0.8780421018600464, "rewards/reward_fn/mean": -1.640625, "rewards/reward_fn/std": 2.0224502086639404, "step": 45 }, { "clip_ratio/high_max": 0.0005688281962648034, "clip_ratio/high_mean": 0.00028329057386144996, "clip_ratio/low_mean": 7.807620568200946e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003613667795434594, "entropy": 0.24025459587574005, "epoch": 0.00368, "grad_norm": 0.3391072154045105, "learning_rate": 2.52e-07, "loss": 0.0552, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "entropy": 0.144713893532753, "epoch": 0.00376, "frac_reward_zero_std": 0.125, "grad_norm": 0.37604060769081116, "learning_rate": 2.576e-07, "loss": 0.0408, "num_tokens": 2328609.0, "reward": -1.8483309745788574, "reward_std": 1.0483298301696777, "rewards/reward_fn/mean": -1.8483309745788574, "rewards/reward_fn/std": 1.4685181379318237, "step": 47 }, { "clip_ratio/high_max": 0.000732421875, "clip_ratio/high_mean": 0.00030517578125, "clip_ratio/low_mean": 0.00018310546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00048828125, "entropy": 0.16569995135068893, "epoch": 0.00384, "grad_norm": 0.5170264840126038, "learning_rate": 2.6320000000000003e-07, "loss": -0.0407, "step": 48 }, { "clip_ratio/high_max": 0.00027382257394492626, "clip_ratio/high_mean": 6.845564348623157e-05, "clip_ratio/low_mean": 0.00019978854834334925, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002682441772776656, "completions/clipped_ratio": 0.9296875, "completions/max_length": 256.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 249.5625, "completions/mean_terminated_length": 164.44444274902344, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.21613524854183197, "epoch": 0.00392, "frac_reward_zero_std": 0.5, "grad_norm": 0.4348979592323303, "learning_rate": 2.6879999999999997e-07, "loss": -0.0419, "num_tokens": 2426089.0, "reward": -1.4296875, "reward_std": 0.4093368649482727, "rewards/reward_fn/mean": -1.4296875, "rewards/reward_fn/std": 1.5042386054992676, "step": 49 }, { "clip_ratio/high_max": 0.0008192244567908347, "clip_ratio/high_mean": 0.00020480611419770867, "clip_ratio/low_mean": 0.00020185173343634233, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004066578403580934, "entropy": 0.18501276522874832, "epoch": 0.004, "grad_norm": 2.492398738861084, "learning_rate": 2.7439999999999997e-07, "loss": 0.0489, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.859375, "completions/max_length": 256.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 242.7890625, "completions/mean_terminated_length": 162.05555725097656, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.3047449067234993, "epoch": 0.00408, "frac_reward_zero_std": 0.25, "grad_norm": 0.5058755874633789, "learning_rate": 2.8e-07, "loss": -0.0625, "num_tokens": 2522702.0, "reward": -1.040299654006958, "reward_std": 1.2774872779846191, "rewards/reward_fn/mean": -1.040299654006958, "rewards/reward_fn/std": 2.307407855987549, "step": 51 }, { "clip_ratio/high_max": 0.0009557804150972515, "clip_ratio/high_mean": 0.0003070278908126056, "clip_ratio/low_mean": 0.00019987006089650095, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005068979371571913, "entropy": 0.4202980548143387, "epoch": 0.00416, "grad_norm": 0.5943037867546082, "learning_rate": 2.856e-07, "loss": 0.0702, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "entropy": 0.13121189922094345, "epoch": 0.00424, "frac_reward_zero_std": 0.5, "grad_norm": 0.27478551864624023, "learning_rate": 2.9119999999999996e-07, "loss": -0.018, "num_tokens": 2621006.0, "reward": -1.734375, "reward_std": 0.5690595507621765, "rewards/reward_fn/mean": -1.734375, "rewards/reward_fn/std": 1.4873977899551392, "step": 53 }, { "clip_ratio/high_max": 0.000732421875, "clip_ratio/high_mean": 0.00048828125, "clip_ratio/low_mean": 6.103515625e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00054931640625, "entropy": 0.13558614253997803, "epoch": 0.00432, "grad_norm": 1.8553444147109985, "learning_rate": 2.968e-07, "loss": 0.0183, "step": 54 }, { "clip_ratio/high_max": 0.000244140625, "clip_ratio/high_mean": 6.103515625e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 6.103515625e-05, "completions/clipped_ratio": 0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 245.8203125, "completions/mean_terminated_length": 147.4166717529297, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.26927511394023895, "epoch": 0.0044, "frac_reward_zero_std": 0.5, "grad_norm": 0.44605177640914917, "learning_rate": 3.024e-07, "loss": -0.0337, "num_tokens": 2718007.0, "reward": -1.537571907043457, "reward_std": 0.6633661985397339, "rewards/reward_fn/mean": -1.537571907043457, "rewards/reward_fn/std": 1.562010407447815, "step": 55 }, { "clip_ratio/high_max": 0.0007774927944410592, "clip_ratio/high_mean": 0.00032971044129226357, "clip_ratio/low_mean": 6.329113966785371e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003930015809601173, "entropy": 0.27154217660427094, "epoch": 0.00448, "grad_norm": 0.5136521458625793, "learning_rate": 3.0799999999999995e-07, "loss": 0.0541, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 256.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 249.046875, "completions/mean_terminated_length": 144.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.18035631626844406, "epoch": 0.00456, "frac_reward_zero_std": 0.625, "grad_norm": 0.30989113450050354, "learning_rate": 3.136e-07, "loss": 0.0386, "num_tokens": 2815421.0, "reward": -1.546875, "reward_std": 0.635059118270874, "rewards/reward_fn/mean": -1.546875, "rewards/reward_fn/std": 1.8817790746688843, "step": 57 }, { "clip_ratio/high_max": 0.0005117707187309861, "clip_ratio/high_mean": 0.00018897783593274653, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018897783593274653, "entropy": 0.15690182149410248, "epoch": 0.00464, "grad_norm": 0.3770614564418793, "learning_rate": 3.192e-07, "loss": -0.0176, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 256.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 252.765625, "completions/mean_terminated_length": 173.1999969482422, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.22265951335430145, "epoch": 0.00472, "frac_reward_zero_std": 0.5, "grad_norm": 0.4115305244922638, "learning_rate": 3.248e-07, "loss": -0.024, "num_tokens": 2913311.0, "reward": -2.296875, "reward_std": 0.6707919836044312, "rewards/reward_fn/mean": -2.296875, "rewards/reward_fn/std": 1.2758160829544067, "step": 59 }, { "clip_ratio/high_max": 0.0005115089588798583, "clip_ratio/high_mean": 0.00012787723971996456, "clip_ratio/low_mean": 6.393861985998228e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019181585230398923, "entropy": 0.19815383106470108, "epoch": 0.0048, "grad_norm": 0.30694642663002014, "learning_rate": 3.304e-07, "loss": 0.0311, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8046875, "completions/max_length": 256.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 232.1953125, "completions/mean_terminated_length": 134.1199951171875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.3968072533607483, "epoch": 0.00488, "frac_reward_zero_std": 0.5, "grad_norm": 0.5265905261039734, "learning_rate": 3.36e-07, "loss": 0.0336, "num_tokens": 3008568.0, "reward": -0.6087501049041748, "reward_std": 0.9391759634017944, "rewards/reward_fn/mean": -0.6087501049041748, "rewards/reward_fn/std": 1.740141749382019, "step": 61 }, { "clip_ratio/high_max": 0.00350526359397918, "clip_ratio/high_mean": 0.001015601388644427, "clip_ratio/low_mean": 7.042253855615854e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010860239272005856, "entropy": 0.3733830004930496, "epoch": 0.00496, "grad_norm": 0.5103439688682556, "learning_rate": 3.4160000000000004e-07, "loss": 0.0074, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9609375, "completions/max_length": 256.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 251.53125, "completions/mean_terminated_length": 141.60000610351562, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.24838313460350037, "epoch": 0.00504, "frac_reward_zero_std": 0.375, "grad_norm": 0.5016983151435852, "learning_rate": 3.472e-07, "loss": -0.0062, "num_tokens": 3106300.0, "reward": -1.96875, "reward_std": 0.9084128141403198, "rewards/reward_fn/mean": -1.96875, "rewards/reward_fn/std": 1.4791862964630127, "step": 63 }, { "clip_ratio/high_max": 0.00048828125, "clip_ratio/high_mean": 0.0001220703125, "clip_ratio/low_mean": 0.0001361825197818689, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002582528395578265, "entropy": 0.19091619551181793, "epoch": 0.00512, "grad_norm": 0.3490804433822632, "learning_rate": 3.528e-07, "loss": 0.0154, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 244.7890625, "completions/mean_terminated_length": 166.3125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.2882993221282959, "epoch": 0.0052, "frac_reward_zero_std": 0.375, "grad_norm": 0.3705078363418579, "learning_rate": 3.5840000000000003e-07, "loss": 0.0032, "num_tokens": 3203169.0, "reward": -1.8046875, "reward_std": 0.9070758819580078, "rewards/reward_fn/mean": -1.8046875, "rewards/reward_fn/std": 1.7391217947006226, "step": 65 }, { "clip_ratio/high_max": 0.0013977173657622188, "clip_ratio/high_mean": 0.0005559021083172411, "clip_ratio/low_mean": 0.0004539448709692806, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010098469792865217, "entropy": 0.35213805735111237, "epoch": 0.00528, "grad_norm": 0.5411931276321411, "learning_rate": 3.64e-07, "loss": 0.0293, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "entropy": 0.12102095037698746, "epoch": 0.00536, "frac_reward_zero_std": 0.5, "grad_norm": 0.2750786542892456, "learning_rate": 3.696e-07, "loss": 0.0152, "num_tokens": 3301473.0, "reward": -1.171875, "reward_std": 0.4324173331260681, "rewards/reward_fn/mean": -1.171875, "rewards/reward_fn/std": 1.4694225788116455, "step": 67 }, { "clip_ratio/high_max": 0.000244140625, "clip_ratio/high_mean": 6.103515625e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 6.103515625e-05, "entropy": 0.12700073421001434, "epoch": 0.00544, "grad_norm": 0.2131596803665161, "learning_rate": 3.752e-07, "loss": -0.0151, "step": 68 }, { "clip_ratio/high_max": 0.000244140625, "clip_ratio/high_mean": 6.103515625e-05, "clip_ratio/low_mean": 0.0002884364075725898, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003494715638225898, "completions/clipped_ratio": 0.8984375, "completions/max_length": 256.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 243.765625, "completions/mean_terminated_length": 135.53846740722656, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.2975913733243942, "epoch": 0.00552, "frac_reward_zero_std": 0.25, "grad_norm": 0.4989601671695709, "learning_rate": 3.8079999999999997e-07, "loss": -0.0221, "num_tokens": 3398211.0, "reward": -1.2108128070831299, "reward_std": 1.1278789043426514, "rewards/reward_fn/mean": -1.2108128070831299, "rewards/reward_fn/std": 1.8290295600891113, "step": 69 }, { "clip_ratio/high_max": 0.0005291469569783658, "clip_ratio/high_mean": 0.00013228673924459144, "clip_ratio/low_mean": 6.103515625e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019332190277054906, "entropy": 0.23015834391117096, "epoch": 0.0056, "grad_norm": 0.49512380361557007, "learning_rate": 3.8639999999999997e-07, "loss": 0.0532, "step": 70 }, { "clip_ratio/high_max": 0.001220703125, "clip_ratio/high_mean": 0.00030517578125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030517578125, "completions/clipped_ratio": 0.8671875, "completions/max_length": 256.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 240.078125, "completions/mean_terminated_length": 136.11764526367188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.28080469369888306, "epoch": 0.00568, "frac_reward_zero_std": 0.375, "grad_norm": 0.5551792979240417, "learning_rate": 3.92e-07, "loss": -0.0499, "num_tokens": 3494477.0, "reward": -1.640625, "reward_std": 0.9311495423316956, "rewards/reward_fn/mean": -1.640625, "rewards/reward_fn/std": 1.9144471883773804, "step": 71 }, { "clip_ratio/high_max": 0.0015854751109145582, "clip_ratio/high_mean": 0.0006973300041863695, "clip_ratio/low_mean": 7.876496238168329e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007760949229123071, "entropy": 0.25071630626916885, "epoch": 0.00576, "grad_norm": 0.4742331802845001, "learning_rate": 3.976e-07, "loss": 0.053, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "entropy": 0.17269732803106308, "epoch": 0.00584, "frac_reward_zero_std": 0.5, "grad_norm": 0.9635801911354065, "learning_rate": 4.0319999999999996e-07, "loss": -0.0019, "num_tokens": 3592781.0, "reward": -1.875, "reward_std": 0.680052638053894, "rewards/reward_fn/mean": -1.875, "rewards/reward_fn/std": 1.4580755233764648, "step": 73 }, { "clip_ratio/high_max": 0.00048828125, "clip_ratio/high_mean": 0.0001220703125, "clip_ratio/low_mean": 0.00018310546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030517578125, "entropy": 0.15371644496917725, "epoch": 0.00592, "grad_norm": 0.3887043595314026, "learning_rate": 4.088e-07, "loss": 0.0019, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8359375, "completions/max_length": 256.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 236.8984375, "completions/mean_terminated_length": 139.57142639160156, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.34394970536231995, "epoch": 0.006, "frac_reward_zero_std": 0.625, "grad_norm": 0.461883008480072, "learning_rate": 4.144e-07, "loss": 0.0407, "num_tokens": 3688640.0, "reward": -0.9398665428161621, "reward_std": 0.5102619528770447, "rewards/reward_fn/mean": -0.9398664832115173, "rewards/reward_fn/std": 1.4724981784820557, "step": 75 }, { "clip_ratio/high_max": 0.0008613616810180247, "clip_ratio/high_mean": 0.0003585555241443217, "clip_ratio/low_mean": 6.151574780233204e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004200712719466537, "entropy": 0.33506855368614197, "epoch": 0.00608, "grad_norm": 0.3789639472961426, "learning_rate": 4.2e-07, "loss": 0.0058, "step": 76 }, { "clip_ratio/high_max": 0.000732421875, "clip_ratio/high_mean": 0.00018310546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018310546875, "completions/clipped_ratio": 0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 246.4375, "completions/mean_terminated_length": 154.0, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.28352929651737213, "epoch": 0.00616, "frac_reward_zero_std": 0.5, "grad_norm": 0.5598105788230896, "learning_rate": 4.256e-07, "loss": -0.0414, "num_tokens": 3785720.0, "reward": -1.5703125, "reward_std": 0.7283279895782471, "rewards/reward_fn/mean": -1.5703125, "rewards/reward_fn/std": 1.8429845571517944, "step": 77 }, { "clip_ratio/high_max": 0.0005074371001683176, "clip_ratio/high_mean": 0.0001268592750420794, "clip_ratio/low_mean": 0.0001299056748393923, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002567649498814717, "entropy": 0.25434861332178116, "epoch": 0.00624, "grad_norm": 1.2740339040756226, "learning_rate": 4.312e-07, "loss": 0.041, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8515625, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 236.734375, "completions/mean_terminated_length": 126.21052551269531, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.33866873383522034, "epoch": 0.00632, "frac_reward_zero_std": 0.25, "grad_norm": 0.5125900506973267, "learning_rate": 4.368e-07, "loss": 0.062, "num_tokens": 3881558.0, "reward": -1.5223575830459595, "reward_std": 1.1045864820480347, "rewards/reward_fn/mean": -1.5223575830459595, "rewards/reward_fn/std": 1.8278412818908691, "step": 79 }, { "clip_ratio/high_max": 0.0018318392103537917, "clip_ratio/high_mean": 0.0005399270157795399, "clip_ratio/low_mean": 0.0001220703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006619972991757095, "entropy": 0.32352712750434875, "epoch": 0.0064, "grad_norm": 0.6005949974060059, "learning_rate": 4.4240000000000004e-07, "loss": -0.0292, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "entropy": 0.0997256226837635, "epoch": 0.00648, "frac_reward_zero_std": 0.75, "grad_norm": 0.1959279328584671, "learning_rate": 4.48e-07, "loss": -0.0047, "num_tokens": 3979862.0, "reward": -2.765625, "reward_std": 0.32173603773117065, "rewards/reward_fn/mean": -2.765625, "rewards/reward_fn/std": 0.8082680702209473, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10259614512324333, "epoch": 0.00656, "grad_norm": 0.29272061586380005, "learning_rate": 4.536e-07, "loss": 0.0047, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9453125, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 250.0390625, "completions/mean_terminated_length": 147.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.21233946084976196, "epoch": 0.00664, "frac_reward_zero_std": 0.25, "grad_norm": 0.4911869466304779, "learning_rate": 4.5920000000000004e-07, "loss": 0.1056, "num_tokens": 4077403.0, "reward": -0.84375, "reward_std": 1.2078179121017456, "rewards/reward_fn/mean": -0.84375, "rewards/reward_fn/std": 1.8030486106872559, "step": 83 }, { "clip_ratio/high_max": 0.000244140625, "clip_ratio/high_mean": 0.0001220703125, "clip_ratio/low_mean": 6.103515625e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018310546875, "entropy": 0.22626448422670364, "epoch": 0.00672, "grad_norm": 0.5197195410728455, "learning_rate": 4.648e-07, "loss": -0.0957, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9296875, "completions/max_length": 256.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 248.40625, "completions/mean_terminated_length": 148.0, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.1789463683962822, "epoch": 0.0068, "frac_reward_zero_std": 0.5, "grad_norm": 0.49142077565193176, "learning_rate": 4.704e-07, "loss": -0.008, "num_tokens": 4174735.0, "reward": -0.515625, "reward_std": 0.6872347593307495, "rewards/reward_fn/mean": -0.515625, "rewards/reward_fn/std": 1.8485337495803833, "step": 85 }, { "clip_ratio/high_max": 0.0016334188403561711, "clip_ratio/high_mean": 0.0005304250225890428, "clip_ratio/low_mean": 0.0001220703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006524953350890428, "entropy": 0.21129196137189865, "epoch": 0.00688, "grad_norm": 0.27174097299575806, "learning_rate": 4.7600000000000003e-07, "loss": 0.0157, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 252.2265625, "completions/mean_terminated_length": 135.25, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.17993124574422836, "epoch": 0.00696, "frac_reward_zero_std": 0.125, "grad_norm": 0.4689631760120392, "learning_rate": 4.816e-07, "loss": 0.0635, "num_tokens": 4272556.0, "reward": -1.3237829208374023, "reward_std": 1.1191099882125854, "rewards/reward_fn/mean": -1.3237829208374023, "rewards/reward_fn/std": 1.5570586919784546, "step": 87 }, { "clip_ratio/high_max": 0.000732421875, "clip_ratio/high_mean": 0.00018310546875, "clip_ratio/low_mean": 6.103515625e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000244140625, "entropy": 0.17580361664295197, "epoch": 0.00704, "grad_norm": 0.31532078981399536, "learning_rate": 4.872e-07, "loss": -0.0436, "step": 88 }, { "clip_ratio/high_max": 0.00027292576851323247, "clip_ratio/high_mean": 6.823144212830812e-05, "clip_ratio/low_mean": 6.720430246787146e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00013543574459617957, "completions/clipped_ratio": 0.890625, "completions/max_length": 256.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 244.9921875, "completions/mean_terminated_length": 155.35714721679688, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.230585977435112, "epoch": 0.00712, "frac_reward_zero_std": 0.125, "grad_norm": 0.4844881594181061, "learning_rate": 4.928000000000001e-07, "loss": -0.0105, "num_tokens": 4369451.0, "reward": -0.890625, "reward_std": 1.2002447843551636, "rewards/reward_fn/mean": -0.890625, "rewards/reward_fn/std": 1.6131075620651245, "step": 89 }, { "clip_ratio/high_max": 0.0007607510196976364, "clip_ratio/high_mean": 0.00025699695106595755, "clip_ratio/low_mean": 0.0002453564666211605, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005023534031352028, "entropy": 0.23140602558851242, "epoch": 0.0072, "grad_norm": 24.592012405395508, "learning_rate": 4.984e-07, "loss": 0.0739, "step": 90 }, { "clip_ratio/high_max": 0.000732421875, "clip_ratio/high_mean": 0.00018310546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018310546875, "completions/clipped_ratio": 0.9453125, "completions/max_length": 256.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 251.5234375, "completions/mean_terminated_length": 174.1428680419922, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.20281335711479187, "epoch": 0.00728, "frac_reward_zero_std": 0.5, "grad_norm": 0.2169298231601715, "learning_rate": 5.04e-07, "loss": 0.0113, "num_tokens": 4467182.0, "reward": -1.600629210472107, "reward_std": 0.5131863355636597, "rewards/reward_fn/mean": -1.6006290912628174, "rewards/reward_fn/std": 1.5203639268875122, "step": 91 }, { "clip_ratio/high_max": 0.001071949431207031, "clip_ratio/high_mean": 0.00026798735780175775, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00026798735780175775, "entropy": 0.18920914828777313, "epoch": 0.00736, "grad_norm": 0.5543444156646729, "learning_rate": 5.096000000000001e-07, "loss": -0.0071, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 234.6484375, "completions/mean_terminated_length": 142.125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.3256616145372391, "epoch": 0.00744, "frac_reward_zero_std": 0.25, "grad_norm": 0.6728318333625793, "learning_rate": 5.152e-07, "loss": -0.0113, "num_tokens": 4562753.0, "reward": -1.3465825319290161, "reward_std": 1.0585445165634155, "rewards/reward_fn/mean": -1.3465825319290161, "rewards/reward_fn/std": 2.1994030475616455, "step": 93 }, { "clip_ratio/high_max": 0.0008284813375212252, "clip_ratio/high_mean": 0.0002790007274597883, "clip_ratio/low_mean": 0.0006518793816212565, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009308800799772143, "entropy": 0.33223310112953186, "epoch": 0.00752, "grad_norm": 0.4726489186286926, "learning_rate": 5.208e-07, "loss": 0.0599, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 6.103515625e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 6.103515625e-05, "completions/clipped_ratio": 0.8046875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 233.4921875, "completions/mean_terminated_length": 140.75999450683594, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 0.3065023422241211, "epoch": 0.0076, "frac_reward_zero_std": 0.25, "grad_norm": 0.5653518438339233, "learning_rate": 5.264000000000001e-07, "loss": 0.0452, "num_tokens": 4658176.0, "reward": -0.4365464448928833, "reward_std": 0.9659615755081177, "rewards/reward_fn/mean": -0.4365464448928833, "rewards/reward_fn/std": 1.611289381980896, "step": 95 }, { "clip_ratio/high_max": 0.001789108500815928, "clip_ratio/high_mean": 0.000672367139486596, "clip_ratio/low_mean": 0.0002044085194938816, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008767756808083504, "entropy": 0.3114447742700577, "epoch": 0.00768, "grad_norm": 0.6383634805679321, "learning_rate": 5.319999999999999e-07, "loss": -0.0148, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "entropy": 0.1643480360507965, "epoch": 0.00776, "frac_reward_zero_std": 0.5, "grad_norm": 0.152774378657341, "learning_rate": 5.375999999999999e-07, "loss": 0.0004, "num_tokens": 4756480.0, "reward": -1.078125, "reward_std": 0.48329198360443115, "rewards/reward_fn/mean": -1.078125, "rewards/reward_fn/std": 1.4451078176498413, "step": 97 }, { "clip_ratio/high_max": 0.000244140625, "clip_ratio/high_mean": 0.0001220703125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001220703125, "entropy": 0.1619122475385666, "epoch": 0.00784, "grad_norm": 0.4099648892879486, "learning_rate": 5.432e-07, "loss": -0.0005, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0001220703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001220703125, "completions/clipped_ratio": 0.8671875, "completions/max_length": 256.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 239.234375, "completions/mean_terminated_length": 129.76470947265625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 0.242280051112175, "epoch": 0.00792, "frac_reward_zero_std": 0.5, "grad_norm": 0.8192784786224365, "learning_rate": 5.487999999999999e-07, "loss": -0.0069, "num_tokens": 4852638.0, "reward": -0.5835965871810913, "reward_std": 0.7445806860923767, "rewards/reward_fn/mean": -0.5835965871810913, "rewards/reward_fn/std": 2.261319875717163, "step": 99 }, { "clip_ratio/high_max": 0.0019230223842896521, "clip_ratio/high_mean": 0.001088144286768511, "clip_ratio/low_mean": 0.00013886182568967342, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012270061124581844, "entropy": 0.28010980784893036, "epoch": 0.008, "grad_norm": 0.38331395387649536, "learning_rate": 5.544e-07, "loss": 0.003, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 6.103515625e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 6.103515625e-05, "completions/clipped_ratio": 0.9921875, "completions/max_length": 256.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 255.703125, "completions/mean_terminated_length": 218.0, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.1292671039700508, "epoch": 0.00808, "frac_reward_zero_std": 0.25, "grad_norm": 0.5328848361968994, "learning_rate": 5.6e-07, "loss": 0.0063, "num_tokens": 4950904.0, "reward": -1.5234375, "reward_std": 0.7116715312004089, "rewards/reward_fn/mean": -1.5234375, "rewards/reward_fn/std": 1.5057100057601929, "step": 101 }, { "clip_ratio/high_max": 0.00024875623057596385, "clip_ratio/high_mean": 6.218905764399096e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 6.218905764399096e-05, "entropy": 0.12978144735097885, "epoch": 0.00816, "grad_norm": 0.42954936623573303, "learning_rate": 5.655999999999999e-07, "loss": -0.0019, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 229.1953125, "completions/mean_terminated_length": 113.04167175292969, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.2555716782808304, "epoch": 0.00824, "frac_reward_zero_std": 0.375, "grad_norm": 0.6439260244369507, "learning_rate": 5.712e-07, "loss": 0.0516, "num_tokens": 5045777.0, "reward": -1.4970316886901855, "reward_std": 0.7675368189811707, "rewards/reward_fn/mean": -1.4970316886901855, "rewards/reward_fn/std": 2.0298619270324707, "step": 103 }, { "clip_ratio/high_max": 0.0005603961471933872, "clip_ratio/high_mean": 0.0002077763492707163, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002077763492707163, "entropy": 0.23489750921726227, "epoch": 0.00832, "grad_norm": 0.342069149017334, "learning_rate": 5.768e-07, "loss": -0.0042, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9296875, "completions/max_length": 256.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 249.9609375, "completions/mean_terminated_length": 170.11111450195312, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.1841924637556076, "epoch": 0.0084, "frac_reward_zero_std": 0.75, "grad_norm": 0.38977959752082825, "learning_rate": 5.823999999999999e-07, "loss": 0.0101, "num_tokens": 5143308.0, "reward": -1.3293828964233398, "reward_std": 0.3290848731994629, "rewards/reward_fn/mean": -1.3293828964233398, "rewards/reward_fn/std": 1.5492980480194092, "step": 105 }, { "clip_ratio/high_max": 0.0010400415631011128, "clip_ratio/high_mean": 0.0003820807032752782, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003820807032752782, "entropy": 0.22631845623254776, "epoch": 0.00848, "grad_norm": 0.160371333360672, "learning_rate": 5.88e-07, "loss": -0.0083, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9140625, "completions/max_length": 256.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 247.78125, "completions/mean_terminated_length": 160.3636474609375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.23016710579395294, "epoch": 0.00856, "frac_reward_zero_std": 0.5, "grad_norm": 5.846016883850098, "learning_rate": 5.936e-07, "loss": 0.0614, "num_tokens": 5240560.0, "reward": -1.078125, "reward_std": 0.7756542563438416, "rewards/reward_fn/mean": -1.078125, "rewards/reward_fn/std": 1.9826387166976929, "step": 107 }, { "clip_ratio/high_max": 0.0018115942366421223, "clip_ratio/high_mean": 0.00045289855916053057, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00045289855916053057, "entropy": 0.23424892127513885, "epoch": 0.00864, "grad_norm": 0.3983052372932434, "learning_rate": 5.991999999999999e-07, "loss": -0.0634, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.921875, "completions/max_length": 256.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 248.8828125, "completions/mean_terminated_length": 164.90000915527344, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.16912409663200378, "epoch": 0.00872, "frac_reward_zero_std": 0.75, "grad_norm": 0.21540196239948273, "learning_rate": 6.048e-07, "loss": 0.023, "num_tokens": 5337953.0, "reward": -1.2079191207885742, "reward_std": 0.2519221305847168, "rewards/reward_fn/mean": -1.2079191207885742, "rewards/reward_fn/std": 1.4898076057434082, "step": 109 }, { "clip_ratio/high_max": 0.00048828125, "clip_ratio/high_mean": 0.0001220703125, "clip_ratio/low_mean": 6.103515625e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018310546875, "entropy": 0.1654917076230049, "epoch": 0.0088, "grad_norm": 0.13835883140563965, "learning_rate": 6.104e-07, "loss": -0.023, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 6.103515625e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 6.103515625e-05, "completions/clipped_ratio": 0.7734375, "completions/max_length": 256.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 229.953125, "completions/mean_terminated_length": 141.03448486328125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 0.2892129123210907, "epoch": 0.00888, "frac_reward_zero_std": 0.375, "grad_norm": 0.49843287467956543, "learning_rate": 6.159999999999999e-07, "loss": 0.0315, "num_tokens": 5432923.0, "reward": -1.939003348350525, "reward_std": 0.8338161706924438, "rewards/reward_fn/mean": -1.939003348350525, "rewards/reward_fn/std": 1.588358759880066, "step": 111 }, { "clip_ratio/high_max": 0.0028263938147574663, "clip_ratio/high_mean": 0.0008521501731593162, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008521501731593162, "entropy": 0.29470881819725037, "epoch": 0.00896, "grad_norm": 0.5346113443374634, "learning_rate": 6.216e-07, "loss": -0.0204, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 7.078142516547814e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 7.078142516547814e-05, "completions/clipped_ratio": 0.9140625, "completions/max_length": 256.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 243.8125, "completions/mean_terminated_length": 114.18182373046875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 0.1579204648733139, "epoch": 0.00904, "frac_reward_zero_std": 0.5, "grad_norm": 0.30824682116508484, "learning_rate": 6.272e-07, "loss": -0.0402, "num_tokens": 5529667.0, "reward": -1.0811654329299927, "reward_std": 0.7100537419319153, "rewards/reward_fn/mean": -1.0811654329299927, "rewards/reward_fn/std": 2.4188928604125977, "step": 113 }, { "clip_ratio/high_max": 0.00026968715246766806, "clip_ratio/high_mean": 6.742178811691701e-05, "clip_ratio/low_mean": 7.822277984814718e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001456445679650642, "entropy": 0.17163921892642975, "epoch": 0.00912, "grad_norm": 0.3274862766265869, "learning_rate": 6.328e-07, "loss": 0.0653, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 221.6875, "completions/mean_terminated_length": 118.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 0.45922812819480896, "epoch": 0.0092, "frac_reward_zero_std": 0.625, "grad_norm": 0.4612703025341034, "learning_rate": 6.384e-07, "loss": -0.0178, "num_tokens": 5623579.0, "reward": -0.41876035928726196, "reward_std": 0.5210762619972229, "rewards/reward_fn/mean": -0.41876035928726196, "rewards/reward_fn/std": 2.1214940547943115, "step": 115 }, { "clip_ratio/high_max": 0.002748774306382984, "clip_ratio/high_mean": 0.000992051893263124, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000992051893263124, "entropy": 0.35550400614738464, "epoch": 0.00928, "grad_norm": 0.4987260103225708, "learning_rate": 6.44e-07, "loss": 0.0096, "step": 116 }, { "clip_ratio/high_max": 0.0011032471666112542, "clip_ratio/high_mean": 0.00027581179165281355, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00027581179165281355, "completions/clipped_ratio": 0.65625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 215.765625, "completions/mean_terminated_length": 138.9545440673828, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.36206501722335815, "epoch": 0.00936, "frac_reward_zero_std": 0.375, "grad_norm": 0.6193500757217407, "learning_rate": 6.496e-07, "loss": -0.0122, "num_tokens": 5716733.0, "reward": -0.8445625901222229, "reward_std": 0.8682079911231995, "rewards/reward_fn/mean": -0.8445625901222229, "rewards/reward_fn/std": 1.9873878955841064, "step": 117 }, { "clip_ratio/high_max": 0.002753317472524941, "clip_ratio/high_mean": 0.0009181115019600838, "clip_ratio/low_mean": 0.0002820224399329163, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012001339346170425, "entropy": 0.39876261353492737, "epoch": 0.00944, "grad_norm": 0.5605751276016235, "learning_rate": 6.552e-07, "loss": 0.0001, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8984375, "completions/max_length": 256.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 242.4453125, "completions/mean_terminated_length": 122.53846740722656, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 0.2587617486715317, "epoch": 0.00952, "frac_reward_zero_std": 0.25, "grad_norm": 0.5665695667266846, "learning_rate": 6.608e-07, "loss": 0.0467, "num_tokens": 5813302.0, "reward": -1.496106743812561, "reward_std": 0.8066415190696716, "rewards/reward_fn/mean": -1.496106743812561, "rewards/reward_fn/std": 1.509921908378601, "step": 119 }, { "clip_ratio/high_max": 0.00095175820752047, "clip_ratio/high_mean": 0.0003600098716560751, "clip_ratio/low_mean": 6.103515625e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004210450279060751, "entropy": 0.27177658677101135, "epoch": 0.0096, "grad_norm": 0.40799498558044434, "learning_rate": 6.664e-07, "loss": -0.0494, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 256.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 255.78125, "completions/mean_terminated_length": 228.0, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "entropy": 0.13357285410165787, "epoch": 0.00968, "frac_reward_zero_std": 0.125, "grad_norm": 0.37464702129364014, "learning_rate": 6.72e-07, "loss": -0.0382, "num_tokens": 5911578.0, "reward": -1.4903607368469238, "reward_std": 1.1849586963653564, "rewards/reward_fn/mean": -1.4903607368469238, "rewards/reward_fn/std": 1.5174853801727295, "step": 121 }, { "clip_ratio/high_max": 0.000244140625, "clip_ratio/high_mean": 6.103515625e-05, "clip_ratio/low_mean": 0.00018310546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000244140625, "entropy": 0.14310607314109802, "epoch": 0.00976, "grad_norm": 0.3581203520298004, "learning_rate": 6.776e-07, "loss": 0.0392, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.921875, "completions/max_length": 256.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 245.0390625, "completions/mean_terminated_length": 115.70000457763672, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 0.14216671139001846, "epoch": 0.00984, "frac_reward_zero_std": 0.375, "grad_norm": 1.137184500694275, "learning_rate": 6.832000000000001e-07, "loss": -0.01, "num_tokens": 6008479.0, "reward": -1.5703125, "reward_std": 0.7348353862762451, "rewards/reward_fn/mean": -1.5703125, "rewards/reward_fn/std": 2.0949268341064453, "step": 123 }, { "clip_ratio/high_max": 0.0009038824064191431, "clip_ratio/high_mean": 0.00028700575785478577, "clip_ratio/low_mean": 6.103515625e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00034804089955287054, "entropy": 0.16693215817213058, "epoch": 0.00992, "grad_norm": 0.22702915966510773, "learning_rate": 6.888e-07, "loss": 0.0211, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9140625, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 245.7578125, "completions/mean_terminated_length": 136.8181915283203, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.22275760769844055, "epoch": 0.01, "frac_reward_zero_std": 0.375, "grad_norm": 0.38381433486938477, "learning_rate": 6.944e-07, "loss": -0.0126, "num_tokens": 6105472.0, "reward": -1.3359375, "reward_std": 0.8360899686813354, "rewards/reward_fn/mean": -1.3359375, "rewards/reward_fn/std": 1.5434767007827759, "step": 125 }, { "clip_ratio/high_max": 0.0008434864867012948, "clip_ratio/high_mean": 0.0002108716216753237, "clip_ratio/low_mean": 6.103515625e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002719067706493661, "entropy": 0.20334941148757935, "epoch": 0.01008, "grad_norm": 0.38353925943374634, "learning_rate": 7.000000000000001e-07, "loss": 0.0301, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 8.322236681124195e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 8.322236681124195e-05, "completions/clipped_ratio": 0.8515625, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 238.40625, "completions/mean_terminated_length": 137.4736785888672, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.28863558173179626, "epoch": 0.01016, "frac_reward_zero_std": 0.25, "grad_norm": 0.5298681855201721, "learning_rate": 7.056e-07, "loss": 0.0277, "num_tokens": 6201524.0, "reward": -1.6477291584014893, "reward_std": 1.0324275493621826, "rewards/reward_fn/mean": -1.6477291584014893, "rewards/reward_fn/std": 2.036229372024536, "step": 127 }, { "clip_ratio/high_max": 0.0015746572462376207, "clip_ratio/high_mean": 0.0005368487545638345, "clip_ratio/low_mean": 0.0003001472941832617, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008369960414711386, "entropy": 0.24748222529888153, "epoch": 0.01024, "grad_norm": 0.5044045448303223, "learning_rate": 7.112e-07, "loss": -0.0149, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "entropy": 0.11907698959112167, "epoch": 0.01032, "frac_reward_zero_std": 0.625, "grad_norm": 0.3055045008659363, "learning_rate": 7.168000000000001e-07, "loss": -0.0157, "num_tokens": 6299828.0, "reward": -1.8984375, "reward_std": 0.43569135665893555, "rewards/reward_fn/mean": -1.8984375, "rewards/reward_fn/std": 1.4517968893051147, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00018310546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018310546875, "entropy": 0.13225752115249634, "epoch": 0.0104, "grad_norm": 0.29560455679893494, "learning_rate": 7.224e-07, "loss": 0.0157, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9296875, "completions/max_length": 256.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 247.90625, "completions/mean_terminated_length": 140.88888549804688, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.19062431901693344, "epoch": 0.01048, "frac_reward_zero_std": 0.5, "grad_norm": 0.525810182094574, "learning_rate": 7.28e-07, "loss": 0.0177, "num_tokens": 6397096.0, "reward": -1.359375, "reward_std": 0.6473738551139832, "rewards/reward_fn/mean": -1.359375, "rewards/reward_fn/std": 1.8389246463775635, "step": 131 }, { "clip_ratio/high_max": 0.0005988024058751762, "clip_ratio/high_mean": 0.00014970060146879405, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00014970060146879405, "entropy": 0.20240294188261032, "epoch": 0.01056, "grad_norm": 0.25243842601776123, "learning_rate": 7.336000000000001e-07, "loss": 0.0057, "step": 132 }, { "clip_ratio/high_max": 0.0006006006151437759, "clip_ratio/high_mean": 0.00015015015378594398, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00015015015378594398, "completions/clipped_ratio": 0.7265625, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 225.78125, "completions/mean_terminated_length": 145.4857177734375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 0.3720102906227112, "epoch": 0.01064, "frac_reward_zero_std": 0.25, "grad_norm": 0.6292359232902527, "learning_rate": 7.392e-07, "loss": 0.0672, "num_tokens": 6491532.0, "reward": -1.3828125, "reward_std": 0.8040870428085327, "rewards/reward_fn/mean": -1.3828125, "rewards/reward_fn/std": 1.8405797481536865, "step": 133 }, { "clip_ratio/high_max": 0.0024326028069481254, "clip_ratio/high_mean": 0.00113281374797225, "clip_ratio/low_mean": 0.0004368906666059047, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015697044436819851, "entropy": 0.34222665429115295, "epoch": 0.01072, "grad_norm": 0.6307878494262695, "learning_rate": 7.447999999999999e-07, "loss": -0.0339, "step": 134 }, { "clip_ratio/high_max": 0.000244140625, "clip_ratio/high_mean": 6.103515625e-05, "clip_ratio/low_mean": 6.103515625e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001220703125, "completions/clipped_ratio": 0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 242.96875, "completions/mean_terminated_length": 151.75, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.3049474209547043, "epoch": 0.0108, "frac_reward_zero_std": 0.375, "grad_norm": 0.5817902684211731, "learning_rate": 7.504e-07, "loss": 0.0413, "num_tokens": 6588168.0, "reward": -0.2578125, "reward_std": 1.0959625244140625, "rewards/reward_fn/mean": -0.2578125, "rewards/reward_fn/std": 2.163571357727051, "step": 135 }, { "clip_ratio/high_max": 0.0018722150125540793, "clip_ratio/high_mean": 0.0006035823607817292, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006035823607817292, "entropy": 0.2439945489168167, "epoch": 0.01088, "grad_norm": 0.4169436991214752, "learning_rate": 7.559999999999999e-07, "loss": -0.0338, "step": 136 }, { "clip_ratio/high_max": 0.0006306591094471514, "clip_ratio/high_mean": 0.00022508657275466248, "clip_ratio/low_mean": 0.00014731880219187587, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00037240536039462313, "completions/clipped_ratio": 0.828125, "completions/max_length": 256.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 236.7265625, "completions/mean_terminated_length": 143.8636474609375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.2834816724061966, "epoch": 0.01096, "frac_reward_zero_std": 0.375, "grad_norm": 0.5871475338935852, "learning_rate": 7.615999999999999e-07, "loss": -0.0228, "num_tokens": 6684005.0, "reward": -1.5453330278396606, "reward_std": 0.8023144602775574, "rewards/reward_fn/mean": -1.5453330278396606, "rewards/reward_fn/std": 1.7260578870773315, "step": 137 }, { "clip_ratio/high_max": 0.0011545513407327235, "clip_ratio/high_mean": 0.00036330928560346365, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00036330928560346365, "entropy": 0.28396381437778473, "epoch": 0.01104, "grad_norm": 0.41690918803215027, "learning_rate": 7.672e-07, "loss": 0.0309, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 241.734375, "completions/mean_terminated_length": 103.83333587646484, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.17786864936351776, "epoch": 0.01112, "frac_reward_zero_std": 0.625, "grad_norm": 0.6180710196495056, "learning_rate": 7.727999999999999e-07, "loss": 0.068, "num_tokens": 6780483.0, "reward": -1.40625, "reward_std": 0.5092360377311707, "rewards/reward_fn/mean": -1.40625, "rewards/reward_fn/std": 2.094001531600952, "step": 139 }, { "clip_ratio/high_max": 0.001647150085773319, "clip_ratio/high_mean": 0.000533857848495245, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000533857848495245, "entropy": 0.21495263278484344, "epoch": 0.0112, "grad_norm": 0.2722243368625641, "learning_rate": 7.783999999999999e-07, "loss": -0.0713, "step": 140 }, { "clip_ratio/high_max": 0.0003180661587975919, "clip_ratio/high_mean": 7.951653969939798e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 7.951653969939798e-05, "completions/clipped_ratio": 0.8046875, "completions/max_length": 256.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 236.8125, "completions/mean_terminated_length": 157.75999450683594, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.2515145242214203, "epoch": 0.01128, "frac_reward_zero_std": 0.375, "grad_norm": 0.5297714471817017, "learning_rate": 7.84e-07, "loss": -0.0203, "num_tokens": 6876331.0, "reward": -1.8046875, "reward_std": 0.6119349002838135, "rewards/reward_fn/mean": -1.8046875, "rewards/reward_fn/std": 1.474500298500061, "step": 141 }, { "clip_ratio/high_max": 0.0014388837153092027, "clip_ratio/high_mean": 0.00035972092882730067, "clip_ratio/low_mean": 0.00029631523648276925, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006560361944139004, "entropy": 0.22058968245983124, "epoch": 0.01136, "grad_norm": 0.4322398602962494, "learning_rate": 7.895999999999999e-07, "loss": 0.0293, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00012913222599308938, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00012913222599308938, "completions/clipped_ratio": 0.921875, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 248.90625, "completions/mean_terminated_length": 165.1999969482422, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.1519613042473793, "epoch": 0.01144, "frac_reward_zero_std": 0.5, "grad_norm": 0.5125957131385803, "learning_rate": 7.952e-07, "loss": -0.0475, "num_tokens": 6973727.0, "reward": -1.4453125, "reward_std": 0.5496097803115845, "rewards/reward_fn/mean": -1.4453125, "rewards/reward_fn/std": 1.5613874197006226, "step": 143 }, { "clip_ratio/high_max": 0.000589622650295496, "clip_ratio/high_mean": 0.000208440818823874, "clip_ratio/low_mean": 0.000221108493860811, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000429549312684685, "entropy": 0.18868591636419296, "epoch": 0.01152, "grad_norm": 0.2947496175765991, "learning_rate": 8.008e-07, "loss": 0.0401, "step": 144 }, { "clip_ratio/high_max": 0.00048828125, "clip_ratio/high_mean": 0.0001220703125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001220703125, "completions/clipped_ratio": 0.890625, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 242.90625, "completions/mean_terminated_length": 136.2857208251953, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 0.2731300890445709, "epoch": 0.0116, "frac_reward_zero_std": 0.125, "grad_norm": 0.5439943075180054, "learning_rate": 8.063999999999999e-07, "loss": -0.0438, "num_tokens": 7070355.0, "reward": -0.8896886110305786, "reward_std": 1.2437005043029785, "rewards/reward_fn/mean": -0.8896886110305786, "rewards/reward_fn/std": 1.6992108821868896, "step": 145 }, { "clip_ratio/high_max": 0.00048828125, "clip_ratio/high_mean": 0.0001220703125, "clip_ratio/low_mean": 0.0005791524599771947, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007012227724771947, "entropy": 0.2713306397199631, "epoch": 0.01168, "grad_norm": 0.362520694732666, "learning_rate": 8.12e-07, "loss": 0.0341, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.921875, "completions/max_length": 256.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 247.03125, "completions/mean_terminated_length": 141.1999969482422, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.14201410114765167, "epoch": 0.01176, "frac_reward_zero_std": 0.75, "grad_norm": 0.12077482044696808, "learning_rate": 8.176e-07, "loss": 0.0049, "num_tokens": 7167511.0, "reward": -1.5864620208740234, "reward_std": 0.25118380784988403, "rewards/reward_fn/mean": -1.5864620208740234, "rewards/reward_fn/std": 1.5129673480987549, "step": 147 }, { "clip_ratio/high_max": 0.0013599080848507583, "clip_ratio/high_mean": 0.0003399770212126896, "clip_ratio/low_mean": 6.28456546110101e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00040282266854774207, "entropy": 0.16956514865159988, "epoch": 0.01184, "grad_norm": 0.48293566703796387, "learning_rate": 8.232e-07, "loss": 0.0086, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "entropy": 0.1546899899840355, "epoch": 0.01192, "frac_reward_zero_std": 0.125, "grad_norm": 0.8335914611816406, "learning_rate": 8.288e-07, "loss": -0.0038, "num_tokens": 7265815.0, "reward": -1.6171875, "reward_std": 1.132455825805664, "rewards/reward_fn/mean": -1.6171875, "rewards/reward_fn/std": 1.501291275024414, "step": 149 }, { "clip_ratio/high_max": 0.000732421875, "clip_ratio/high_mean": 0.00030517578125, "clip_ratio/low_mean": 0.0001220703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00042724609375, "entropy": 0.19781331717967987, "epoch": 0.012, "grad_norm": 0.39618122577667236, "learning_rate": 8.344e-07, "loss": 0.0039, "step": 150 }, { "clip_ratio/high_max": 0.00026014569448307157, "clip_ratio/high_mean": 6.503642362076789e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 6.503642362076789e-05, "completions/clipped_ratio": 0.828125, "completions/max_length": 256.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 234.4140625, "completions/mean_terminated_length": 130.40908813476562, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.2232275903224945, "epoch": 0.01208, "frac_reward_zero_std": 0.5, "grad_norm": 0.47036486864089966, "learning_rate": 8.4e-07, "loss": 0.0368, "num_tokens": 7361356.0, "reward": -0.140625, "reward_std": 0.5407092571258545, "rewards/reward_fn/mean": -0.140625, "rewards/reward_fn/std": 2.2227683067321777, "step": 151 }, { "clip_ratio/high_max": 0.0011684221681207418, "clip_ratio/high_mean": 0.0003732217155629769, "clip_ratio/low_mean": 0.00030517578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006783974822610617, "entropy": 0.225350022315979, "epoch": 0.01216, "grad_norm": 0.28390660881996155, "learning_rate": 8.456e-07, "loss": -0.0367, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 255.9375, "completions/mean_terminated_length": 248.0, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "entropy": 0.13980545476078987, "epoch": 0.01224, "frac_reward_zero_std": 0.5, "grad_norm": 0.29820889234542847, "learning_rate": 8.512e-07, "loss": -0.0155, "num_tokens": 7459652.0, "reward": -0.890625, "reward_std": 0.6281307339668274, "rewards/reward_fn/mean": -0.890625, "rewards/reward_fn/std": 1.56856107711792, "step": 153 }, { "clip_ratio/high_max": 0.00146484375, "clip_ratio/high_mean": 0.00048828125, "clip_ratio/low_mean": 6.103515625e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00054931640625, "entropy": 0.15024221688508987, "epoch": 0.01232, "grad_norm": 0.34274306893348694, "learning_rate": 8.568e-07, "loss": 0.0156, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 239.6953125, "completions/mean_terminated_length": 125.5625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.17655454576015472, "epoch": 0.0124, "frac_reward_zero_std": 0.75, "grad_norm": 0.20251396298408508, "learning_rate": 8.624e-07, "loss": 0.0001, "num_tokens": 7555869.0, "reward": -0.84375, "reward_std": 0.24491733312606812, "rewards/reward_fn/mean": -0.84375, "rewards/reward_fn/std": 2.0251858234405518, "step": 155 }, { "clip_ratio/high_max": 0.000244140625, "clip_ratio/high_mean": 6.103515625e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 6.103515625e-05, "entropy": 0.18429259210824966, "epoch": 0.01248, "grad_norm": 0.19968733191490173, "learning_rate": 8.68e-07, "loss": -0.0, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 7.030371489236131e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 7.030371489236131e-05, "completions/clipped_ratio": 0.8359375, "completions/max_length": 256.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 235.3984375, "completions/mean_terminated_length": 130.42857360839844, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 0.23863250762224197, "epoch": 0.01256, "frac_reward_zero_std": 0.5, "grad_norm": 0.6645041108131409, "learning_rate": 8.736e-07, "loss": -0.0645, "num_tokens": 7651536.0, "reward": -0.679375171661377, "reward_std": 0.4771604537963867, "rewards/reward_fn/mean": -0.679375171661377, "rewards/reward_fn/std": 2.4170053005218506, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 7.977025961736217e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 7.977025961736217e-05, "entropy": 0.27378956228494644, "epoch": 0.01264, "grad_norm": 0.34023985266685486, "learning_rate": 8.791999999999999e-07, "loss": 0.0645, "step": 158 }, { "clip_ratio/high_max": 0.00048828125, "clip_ratio/high_mean": 0.0001220703125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001220703125, "completions/clipped_ratio": 0.8515625, "completions/max_length": 256.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 231.4765625, "completions/mean_terminated_length": 90.78947448730469, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 0.28166326880455017, "epoch": 0.01272, "frac_reward_zero_std": 0.375, "grad_norm": 0.5141552686691284, "learning_rate": 8.848000000000001e-07, "loss": 0.0042, "num_tokens": 7746701.0, "reward": -0.9609375, "reward_std": 0.6542541980743408, "rewards/reward_fn/mean": -0.9609375, "rewards/reward_fn/std": 2.059746742248535, "step": 159 }, { "clip_ratio/high_max": 0.0030907279578968883, "clip_ratio/high_mean": 0.0007726819894742221, "clip_ratio/low_mean": 0.00013165662676328793, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009043386089615524, "entropy": 0.24627144634723663, "epoch": 0.0128, "grad_norm": 0.5340589284896851, "learning_rate": 8.904000000000001e-07, "loss": 0.0119, "step": 160 }, { "clip_ratio/high_max": 0.00028951940475963056, "clip_ratio/high_mean": 7.237985118990764e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 7.237985118990764e-05, "completions/clipped_ratio": 0.890625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 241.296875, "completions/mean_terminated_length": 121.5714340209961, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 0.2091401442885399, "epoch": 0.01288, "frac_reward_zero_std": 0.25, "grad_norm": 0.4241023659706116, "learning_rate": 8.96e-07, "loss": -0.0207, "num_tokens": 7843123.0, "reward": -1.546875, "reward_std": 0.9271388053894043, "rewards/reward_fn/mean": -1.546875, "rewards/reward_fn/std": 1.596548318862915, "step": 161 }, { "clip_ratio/high_max": 0.0013968783896416426, "clip_ratio/high_mean": 0.0005323250661604106, "clip_ratio/low_mean": 0.0005488699243869632, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010811950196512043, "entropy": 0.21888738870620728, "epoch": 0.01296, "grad_norm": 0.44768524169921875, "learning_rate": 9.016e-07, "loss": 0.0281, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 256.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 255.484375, "completions/mean_terminated_length": 190.0, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.1723433956503868, "epoch": 0.01304, "frac_reward_zero_std": 0.5, "grad_norm": 0.2550674080848694, "learning_rate": 9.072e-07, "loss": 0.0029, "num_tokens": 7941361.0, "reward": -1.4987605810165405, "reward_std": 0.6680183410644531, "rewards/reward_fn/mean": -1.4987605810165405, "rewards/reward_fn/std": 1.5072029829025269, "step": 163 }, { "clip_ratio/high_max": 0.0009765625, "clip_ratio/high_mean": 0.000244140625, "clip_ratio/low_mean": 0.0001220703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003662109375, "entropy": 0.16882961243391037, "epoch": 0.01312, "grad_norm": 0.34651950001716614, "learning_rate": 9.127999999999999e-07, "loss": -0.0019, "step": 164 }, { "clip_ratio/high_max": 0.0005367672711145133, "clip_ratio/high_mean": 0.00013419181777862832, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00013419181777862832, "completions/clipped_ratio": 0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 234.34375, "completions/mean_terminated_length": 140.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.2726888954639435, "epoch": 0.0132, "frac_reward_zero_std": 0.125, "grad_norm": 0.683080792427063, "learning_rate": 9.184000000000001e-07, "loss": -0.0049, "num_tokens": 8036893.0, "reward": -0.8410792350769043, "reward_std": 1.1305444240570068, "rewards/reward_fn/mean": -0.8410792350769043, "rewards/reward_fn/std": 1.7907743453979492, "step": 165 }, { "clip_ratio/high_max": 0.0011627169733401388, "clip_ratio/high_mean": 0.0005742482826462947, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005742482826462947, "entropy": 0.2741318941116333, "epoch": 0.01328, "grad_norm": 0.5230721235275269, "learning_rate": 9.240000000000001e-07, "loss": 0.0131, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 6.103515625e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 6.103515625e-05, "completions/clipped_ratio": 0.9921875, "completions/max_length": 256.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 255.375, "completions/mean_terminated_length": 176.0, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.19001999497413635, "epoch": 0.01336, "frac_reward_zero_std": 0.25, "grad_norm": 0.45260652899742126, "learning_rate": 9.296e-07, "loss": 0.0434, "num_tokens": 8135117.0, "reward": -1.1953125, "reward_std": 0.9005447626113892, "rewards/reward_fn/mean": -1.1953125, "rewards/reward_fn/std": 1.474500298500061, "step": 167 }, { "clip_ratio/high_max": 0.0009765625, "clip_ratio/high_mean": 0.00030765688279643655, "clip_ratio/low_mean": 0.0001220703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00042972719529643655, "entropy": 0.20105057209730148, "epoch": 0.01344, "grad_norm": 0.4466783404350281, "learning_rate": 9.352e-07, "loss": -0.0392, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 256.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 255.5234375, "completions/mean_terminated_length": 195.0, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.17403801530599594, "epoch": 0.01352, "frac_reward_zero_std": 0.375, "grad_norm": 0.5028333067893982, "learning_rate": 9.408e-07, "loss": -0.0352, "num_tokens": 8233360.0, "reward": -1.921875, "reward_std": 0.6179215312004089, "rewards/reward_fn/mean": -1.921875, "rewards/reward_fn/std": 1.5400652885437012, "step": 169 }, { "clip_ratio/high_max": 0.000732421875, "clip_ratio/high_mean": 0.000244140625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000244140625, "entropy": 0.1511266604065895, "epoch": 0.0136, "grad_norm": 0.35953035950660706, "learning_rate": 9.463999999999998e-07, "loss": 0.0423, "step": 170 }, { "clip_ratio/high_max": 0.000244140625, "clip_ratio/high_mean": 6.103515625e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 6.103515625e-05, "completions/clipped_ratio": 0.8515625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 241.5546875, "completions/mean_terminated_length": 158.6842041015625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.24161381274461746, "epoch": 0.01368, "frac_reward_zero_std": 0.25, "grad_norm": 0.5807982087135315, "learning_rate": 9.520000000000001e-07, "loss": -0.0515, "num_tokens": 8329815.0, "reward": -0.48249077796936035, "reward_std": 0.6717150807380676, "rewards/reward_fn/mean": -0.48249074816703796, "rewards/reward_fn/std": 2.2856316566467285, "step": 171 }, { "clip_ratio/high_max": 0.0014549685874953866, "clip_ratio/high_mean": 0.0006757222290616482, "clip_ratio/low_mean": 6.103515625e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007367573853116482, "entropy": 0.22436504065990448, "epoch": 0.01376, "grad_norm": 0.546265721321106, "learning_rate": 9.576e-07, "loss": 0.061, "step": 172 }, { "clip_ratio/high_max": 0.00025239778915420175, "clip_ratio/high_mean": 6.309944728855044e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 6.309944728855044e-05, "completions/clipped_ratio": 0.9921875, "completions/max_length": 256.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 255.4765625, "completions/mean_terminated_length": 189.0, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.17850255221128464, "epoch": 0.01384, "frac_reward_zero_std": 0.375, "grad_norm": 0.3855346441268921, "learning_rate": 9.632e-07, "loss": 0.033, "num_tokens": 8428052.0, "reward": -1.546875, "reward_std": 0.7631022334098816, "rewards/reward_fn/mean": -1.546875, "rewards/reward_fn/std": 1.551526427268982, "step": 173 }, { "clip_ratio/high_max": 0.000732421875, "clip_ratio/high_mean": 0.00018310546875, "clip_ratio/low_mean": 6.103515625e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000244140625, "entropy": 0.21059687435626984, "epoch": 0.01392, "grad_norm": 0.4318319857120514, "learning_rate": 9.688e-07, "loss": -0.0288, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8984375, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 240.1484375, "completions/mean_terminated_length": 99.92308044433594, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.214984729886055, "epoch": 0.014, "frac_reward_zero_std": 0.375, "grad_norm": 0.37443095445632935, "learning_rate": 9.744e-07, "loss": 0.0195, "num_tokens": 8524327.0, "reward": -1.7109375, "reward_std": 0.6346870064735413, "rewards/reward_fn/mean": -1.7109375, "rewards/reward_fn/std": 1.5831412076950073, "step": 175 }, { "clip_ratio/high_max": 0.00029019152862019837, "clip_ratio/high_mean": 7.254788215504959e-05, "clip_ratio/low_mean": 6.103515625e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001335830384050496, "entropy": 0.20071523636579514, "epoch": 0.01408, "grad_norm": 0.43105214834213257, "learning_rate": 9.800000000000001e-07, "loss": -0.018, "step": 176 }, { "clip_ratio/high_max": 0.00025786488549783826, "clip_ratio/high_mean": 6.446622137445956e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 6.446622137445956e-05, "completions/clipped_ratio": 0.9296875, "completions/max_length": 256.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 248.3359375, "completions/mean_terminated_length": 147.0, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.23165735602378845, "epoch": 0.01416, "frac_reward_zero_std": 0.125, "grad_norm": 0.6483467221260071, "learning_rate": 9.856000000000001e-07, "loss": 0.0603, "num_tokens": 8621650.0, "reward": -0.7667518854141235, "reward_std": 0.9230960607528687, "rewards/reward_fn/mean": -0.7667518854141235, "rewards/reward_fn/std": 1.9685657024383545, "step": 177 }, { "clip_ratio/high_max": 0.0024546069325879216, "clip_ratio/high_mean": 0.0009188275435008109, "clip_ratio/low_mean": 0.00020512737683020532, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011239548912271857, "entropy": 0.24869053810834885, "epoch": 0.01424, "grad_norm": 0.5821310877799988, "learning_rate": 9.912e-07, "loss": -0.0655, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 228.3046875, "completions/mean_terminated_length": 129.3928680419922, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.29644716531038284, "epoch": 0.01432, "frac_reward_zero_std": 0.25, "grad_norm": 0.5408415794372559, "learning_rate": 9.968e-07, "loss": 0.0563, "num_tokens": 8716409.0, "reward": -1.6528704166412354, "reward_std": 0.8270192742347717, "rewards/reward_fn/mean": -1.6528704166412354, "rewards/reward_fn/std": 1.6465967893600464, "step": 179 }, { "clip_ratio/high_max": 0.0012771391775459051, "clip_ratio/high_mean": 0.00046599842607975006, "clip_ratio/low_mean": 0.0004885889065917581, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009545873035676777, "entropy": 0.28542880713939667, "epoch": 0.0144, "grad_norm": 0.7492707371711731, "learning_rate": 1.0024e-06, "loss": -0.0771, "step": 180 }, { "clip_ratio/high_max": 0.0002900231920648366, "clip_ratio/high_mean": 7.250579801620916e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 7.250579801620916e-05, "completions/clipped_ratio": 0.8984375, "completions/max_length": 256.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 242.2734375, "completions/mean_terminated_length": 120.84616088867188, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.24518371373414993, "epoch": 0.01448, "frac_reward_zero_std": 0.125, "grad_norm": 0.4695487320423126, "learning_rate": 1.008e-06, "loss": -0.052, "num_tokens": 8812956.0, "reward": -0.984375, "reward_std": 0.976152777671814, "rewards/reward_fn/mean": -0.984375, "rewards/reward_fn/std": 1.7702012062072754, "step": 181 }, { "clip_ratio/high_max": 0.0007374805281870067, "clip_ratio/high_mean": 0.0004536230699159205, "clip_ratio/low_mean": 0.000383656399208121, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008372794836759567, "entropy": 0.26895327866077423, "epoch": 0.01456, "grad_norm": 0.7007917761802673, "learning_rate": 1.0136000000000001e-06, "loss": 0.0463, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 6.589351687580347e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 6.589351687580347e-05, "completions/clipped_ratio": 0.8984375, "completions/max_length": 256.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 242.6015625, "completions/mean_terminated_length": 124.0769271850586, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.2356422394514084, "epoch": 0.01464, "frac_reward_zero_std": 0.25, "grad_norm": 0.5134413838386536, "learning_rate": 1.0192000000000001e-06, "loss": -0.0177, "num_tokens": 8909545.0, "reward": -1.1225003004074097, "reward_std": 0.8265896439552307, "rewards/reward_fn/mean": -1.1225003004074097, "rewards/reward_fn/std": 2.097477912902832, "step": 183 }, { "clip_ratio/high_max": 0.000732421875, "clip_ratio/high_mean": 0.000244140625, "clip_ratio/low_mean": 0.0001387347438139841, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003828753833658993, "entropy": 0.23884353786706924, "epoch": 0.01472, "grad_norm": 0.397490531206131, "learning_rate": 1.0248e-06, "loss": 0.0258, "step": 184 }, { "clip_ratio/high_max": 0.0007372372783720493, "clip_ratio/high_mean": 0.00018430931959301233, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018430931959301233, "completions/clipped_ratio": 0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 242.046875, "completions/mean_terminated_length": 144.375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.19886889308691025, "epoch": 0.0148, "frac_reward_zero_std": 0.25, "grad_norm": 0.5873580574989319, "learning_rate": 1.0304e-06, "loss": -0.0002, "num_tokens": 9006063.0, "reward": -1.078125, "reward_std": 0.7607901096343994, "rewards/reward_fn/mean": -1.078125, "rewards/reward_fn/std": 1.5400652885437012, "step": 185 }, { "clip_ratio/high_max": 0.000732421875, "clip_ratio/high_mean": 0.0003397864056751132, "clip_ratio/low_mean": 0.0005804247921332717, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009202111978083849, "entropy": 0.24189598858356476, "epoch": 0.01488, "grad_norm": 0.3199136257171631, "learning_rate": 1.036e-06, "loss": 0.0014, "step": 186 }, { "clip_ratio/high_max": 0.0009765625, "clip_ratio/high_mean": 0.000244140625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000244140625, "completions/clipped_ratio": 0.921875, "completions/max_length": 256.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 249.2734375, "completions/mean_terminated_length": 169.90000915527344, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.23669108003377914, "epoch": 0.01496, "frac_reward_zero_std": 0.25, "grad_norm": 0.4339716136455536, "learning_rate": 1.0416e-06, "loss": 0.0098, "num_tokens": 9103506.0, "reward": -0.7734375, "reward_std": 1.0956165790557861, "rewards/reward_fn/mean": -0.7734375, "rewards/reward_fn/std": 1.735297441482544, "step": 187 }, { "clip_ratio/high_max": 0.0024800754617899656, "clip_ratio/high_mean": 0.001236710580997169, "clip_ratio/low_mean": 0.0005508125177584589, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017875230987556279, "entropy": 0.24907439947128296, "epoch": 0.01504, "grad_norm": 0.6542237401008606, "learning_rate": 1.0472000000000001e-06, "loss": -0.0013, "step": 188 }, { "clip_ratio/high_max": 0.0011482808913569897, "clip_ratio/high_mean": 0.00040914053533924744, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00040914053533924744, "completions/clipped_ratio": 0.8046875, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 234.859375, "completions/mean_terminated_length": 147.75999450683594, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.2788490578532219, "epoch": 0.01512, "frac_reward_zero_std": 0.0, "grad_norm": 0.8044178485870361, "learning_rate": 1.0528000000000001e-06, "loss": 0.0531, "num_tokens": 9199104.0, "reward": -1.0943706035614014, "reward_std": 0.9164309501647949, "rewards/reward_fn/mean": -1.0943706035614014, "rewards/reward_fn/std": 1.4857969284057617, "step": 189 }, { "clip_ratio/high_max": 0.0034432302927598357, "clip_ratio/high_mean": 0.0016752941883169115, "clip_ratio/low_mean": 0.0006124031206127256, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022876972798258066, "entropy": 0.2656025141477585, "epoch": 0.0152, "grad_norm": 0.7183839082717896, "learning_rate": 1.0584e-06, "loss": -0.0657, "step": 190 }, { "clip_ratio/high_max": 0.00032851510331965983, "clip_ratio/high_mean": 8.212877582991496e-05, "clip_ratio/low_mean": 0.00014913758786860853, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002312663709744811, "completions/clipped_ratio": 0.6640625, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 209.1640625, "completions/mean_terminated_length": 116.5813980102539, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 0.2810486704111099, "epoch": 0.01528, "frac_reward_zero_std": 0.25, "grad_norm": 0.6677382588386536, "learning_rate": 1.0639999999999999e-06, "loss": -0.0075, "num_tokens": 9291413.0, "reward": 0.1171875, "reward_std": 0.8898546099662781, "rewards/reward_fn/mean": 0.1171875, "rewards/reward_fn/std": 2.1430022716522217, "step": 191 }, { "clip_ratio/high_max": 0.0035894878674298525, "clip_ratio/high_mean": 0.001161246036645025, "clip_ratio/low_mean": 0.0005623815959552303, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001723627676256001, "entropy": 0.2911089360713959, "epoch": 0.01536, "grad_norm": 0.8836938142776489, "learning_rate": 1.0695999999999999e-06, "loss": -0.001, "step": 192 }, { "clip_ratio/high_max": 0.0006825938471592963, "clip_ratio/high_mean": 0.00023358904581982642, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023358904581982642, "completions/clipped_ratio": 0.7109375, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 220.1796875, "completions/mean_terminated_length": 132.08108520507812, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 0.3276675343513489, "epoch": 0.01544, "frac_reward_zero_std": 0.0, "grad_norm": 0.8173961639404297, "learning_rate": 1.0751999999999999e-06, "loss": 0.001, "num_tokens": 9385132.0, "reward": -0.5576639175415039, "reward_std": 1.3826322555541992, "rewards/reward_fn/mean": -0.5576639175415039, "rewards/reward_fn/std": 1.887349009513855, "step": 193 }, { "clip_ratio/high_max": 0.0032186461612582207, "clip_ratio/high_mean": 0.0011496018851175904, "clip_ratio/low_mean": 0.0003248919820180163, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001474493881687522, "entropy": 0.3491624742746353, "epoch": 0.01552, "grad_norm": 0.6649034023284912, "learning_rate": 1.0808e-06, "loss": 0.0139, "step": 194 }, { "clip_ratio/high_max": 0.0005237826844677329, "clip_ratio/high_mean": 0.00013094567111693323, "clip_ratio/low_mean": 0.0001220703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002530159836169332, "completions/clipped_ratio": 0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 231.5234375, "completions/mean_terminated_length": 158.09375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 0.27145126461982727, "epoch": 0.0156, "frac_reward_zero_std": 0.125, "grad_norm": 0.7575114369392395, "learning_rate": 1.0864e-06, "loss": 0.0159, "num_tokens": 9480303.0, "reward": -1.0546875, "reward_std": 1.0169532299041748, "rewards/reward_fn/mean": -1.0546875, "rewards/reward_fn/std": 1.4380027055740356, "step": 195 }, { "clip_ratio/high_max": 0.0049878989811986685, "clip_ratio/high_mean": 0.0020302943303249776, "clip_ratio/low_mean": 0.00030517578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0023354701115749776, "entropy": 0.32893580198287964, "epoch": 0.01568, "grad_norm": 0.5413504242897034, "learning_rate": 1.0919999999999999e-06, "loss": 0.0033, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8515625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 248.1328125, "completions/mean_terminated_length": 203.0, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.29076580703258514, "epoch": 0.01576, "frac_reward_zero_std": 0.125, "grad_norm": 0.6498634219169617, "learning_rate": 1.0975999999999999e-06, "loss": 0.0569, "num_tokens": 9577600.0, "reward": -0.5390625, "reward_std": 0.8793766498565674, "rewards/reward_fn/mean": -0.5390625, "rewards/reward_fn/std": 1.2160497903823853, "step": 197 }, { "clip_ratio/high_max": 0.0038424389204010367, "clip_ratio/high_mean": 0.0017540667904540896, "clip_ratio/low_mean": 0.0009411872597411275, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026952539337798953, "entropy": 0.2787573039531708, "epoch": 0.01584, "grad_norm": 0.5501229166984558, "learning_rate": 1.1031999999999999e-06, "loss": -0.0438, "step": 198 }, { "clip_ratio/high_max": 0.0007642775890417397, "clip_ratio/high_mean": 0.00019106939726043493, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019106939726043493, "completions/clipped_ratio": 0.6328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.734375, "completions/mean_terminated_length": 176.29786682128906, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 0.3573096990585327, "epoch": 0.01592, "frac_reward_zero_std": 0.125, "grad_norm": 0.749981164932251, "learning_rate": 1.1088e-06, "loss": -0.0311, "num_tokens": 9672158.0, "reward": -0.2109375, "reward_std": 1.0149574279785156, "rewards/reward_fn/mean": -0.2109375, "rewards/reward_fn/std": 1.6913505792617798, "step": 199 }, { "clip_ratio/high_max": 0.006471493048593402, "clip_ratio/high_mean": 0.0039364692056551576, "clip_ratio/low_mean": 0.0001307531347265467, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004067222238518298, "entropy": 0.353684663772583, "epoch": 0.016, "grad_norm": 0.8618205785751343, "learning_rate": 1.1144e-06, "loss": 0.0595, "step": 200 }, { "clip_ratio/high_max": 0.000244140625, "clip_ratio/high_mean": 6.103515625e-05, "clip_ratio/low_mean": 6.838074477855116e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00012941590102855116, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 215.3046875, "completions/mean_terminated_length": 177.0757598876953, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.3608485758304596, "epoch": 0.01608, "frac_reward_zero_std": 0.5, "grad_norm": 0.4605451822280884, "learning_rate": 1.12e-06, "loss": 0.0594, "num_tokens": 9765253.0, "reward": -0.0234375, "reward_std": 0.4093368649482727, "rewards/reward_fn/mean": -0.0234375, "rewards/reward_fn/std": 0.59479159116745, "step": 201 }, { "clip_ratio/high_max": 0.00670200539752841, "clip_ratio/high_mean": 0.0029973165364935994, "clip_ratio/low_mean": 0.0015056956908665597, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004503012169152498, "entropy": 0.3769911825656891, "epoch": 0.01616, "grad_norm": 0.7966389060020447, "learning_rate": 1.1256e-06, "loss": -0.0535, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 196.890625, "completions/mean_terminated_length": 160.2278594970703, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.3618808388710022, "epoch": 0.01624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1311999999999999e-06, "loss": 0.0, "num_tokens": 9855991.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.331061452627182, "epoch": 0.01632, "grad_norm": 0.0, "learning_rate": 1.1367999999999999e-06, "loss": 0.0, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.84375, "completions/mean_terminated_length": 200.15789794921875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.32358674705028534, "epoch": 0.0164, "frac_reward_zero_std": 0.5, "grad_norm": 0.7151842713356018, "learning_rate": 1.1424e-06, "loss": 0.0066, "num_tokens": 9950051.0, "reward": -0.16234397888183594, "reward_std": 0.4499809741973877, "rewards/reward_fn/mean": -0.16234397888183594, "rewards/reward_fn/std": 0.6852259635925293, "step": 205 }, { "clip_ratio/high_max": 0.006708669243380427, "clip_ratio/high_mean": 0.002798927715048194, "clip_ratio/low_mean": 0.00036381802055984735, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031627456191927195, "entropy": 0.3197632133960724, "epoch": 0.01648, "grad_norm": 0.6870349645614624, "learning_rate": 1.148e-06, "loss": 0.0127, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 208.9609375, "completions/mean_terminated_length": 184.32142639160156, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.3052171617746353, "epoch": 0.01656, "frac_reward_zero_std": 0.75, "grad_norm": 0.4157187342643738, "learning_rate": 1.1536e-06, "loss": 0.0082, "num_tokens": 10042334.0, "reward": 0.45131343603134155, "reward_std": 0.1875, "rewards/reward_fn/mean": 0.45131343603134155, "rewards/reward_fn/std": 1.0920906066894531, "step": 207 }, { "clip_ratio/high_max": 0.0033955994294956326, "clip_ratio/high_mean": 0.0015873978263698518, "clip_ratio/low_mean": 8.26719551696442e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001670069817919284, "entropy": 0.2936283349990845, "epoch": 0.01664, "grad_norm": 0.45428013801574707, "learning_rate": 1.1592e-06, "loss": -0.0004, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 222.796875, "completions/mean_terminated_length": 196.97222900390625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.2704962342977524, "epoch": 0.01672, "frac_reward_zero_std": 0.625, "grad_norm": 0.7537325620651245, "learning_rate": 1.1647999999999998e-06, "loss": 0.025, "num_tokens": 10136388.0, "reward": 0.3046875, "reward_std": 0.28125, "rewards/reward_fn/mean": 0.3046875, "rewards/reward_fn/std": 1.1193262338638306, "step": 209 }, { "clip_ratio/high_max": 0.007405856391415, "clip_ratio/high_mean": 0.0024966762866824865, "clip_ratio/low_mean": 0.00019851773686241359, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026951940963044763, "entropy": 0.26198112964630127, "epoch": 0.0168, "grad_norm": 1.1515355110168457, "learning_rate": 1.1703999999999998e-06, "loss": -0.0154, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 183.90625, "completions/mean_terminated_length": 163.72000122070312, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 0.28853215277194977, "epoch": 0.01688, "frac_reward_zero_std": 0.625, "grad_norm": 0.8170933127403259, "learning_rate": 1.176e-06, "loss": -0.0463, "num_tokens": 10225464.0, "reward": 1.1152360439300537, "reward_std": 0.33099275827407837, "rewards/reward_fn/mean": 1.1152360439300537, "rewards/reward_fn/std": 1.4352660179138184, "step": 211 }, { "clip_ratio/high_max": 0.00409234594553709, "clip_ratio/high_mean": 0.001664319192059338, "clip_ratio/low_mean": 0.0016643868875689805, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0033287060214206576, "entropy": 0.29568228125572205, "epoch": 0.01696, "grad_norm": 0.5702215433120728, "learning_rate": 1.1816e-06, "loss": 0.0392, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 196.09375, "completions/mean_terminated_length": 169.8426971435547, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.25143909454345703, "epoch": 0.01704, "frac_reward_zero_std": 0.875, "grad_norm": 0.2743165194988251, "learning_rate": 1.1872e-06, "loss": 0.0262, "num_tokens": 10316100.0, "reward": 0.0703125, "reward_std": 0.15116733312606812, "rewards/reward_fn/mean": 0.0703125, "rewards/reward_fn/std": 0.45564860105514526, "step": 213 }, { "clip_ratio/high_max": 0.00136239780113101, "clip_ratio/high_mean": 0.0003405994502827525, "clip_ratio/low_mean": 0.00017029972514137626, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005108991754241288, "entropy": 0.2659657299518585, "epoch": 0.01712, "grad_norm": 0.5337011814117432, "learning_rate": 1.1928e-06, "loss": -0.0221, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 213.5546875, "completions/mean_terminated_length": 193.55172729492188, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.27661439776420593, "epoch": 0.0172, "frac_reward_zero_std": 0.875, "grad_norm": 0.535425066947937, "learning_rate": 1.1983999999999998e-06, "loss": 0.0235, "num_tokens": 10408971.0, "reward": 0.3515625, "reward_std": 0.09375, "rewards/reward_fn/mean": 0.3515625, "rewards/reward_fn/std": 1.0393050909042358, "step": 215 }, { "clip_ratio/high_max": 0.004765416262671351, "clip_ratio/high_mean": 0.0011913540656678379, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011913540656678379, "entropy": 0.22978316992521286, "epoch": 0.01728, "grad_norm": 0.10747747868299484, "learning_rate": 1.2039999999999998e-06, "loss": -0.0189, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 206.2734375, "completions/mean_terminated_length": 181.11764526367188, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.24962922930717468, "epoch": 0.01736, "frac_reward_zero_std": 0.875, "grad_norm": 0.2139553427696228, "learning_rate": 1.2096e-06, "loss": -0.0196, "num_tokens": 10500910.0, "reward": 0.703125, "reward_std": 0.1280868798494339, "rewards/reward_fn/mean": 0.703125, "rewards/reward_fn/std": 1.3824511766433716, "step": 217 }, { "clip_ratio/high_max": 0.0030633690766990185, "clip_ratio/high_mean": 0.0007658422691747546, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007658422691747546, "entropy": 0.26026245951652527, "epoch": 0.01744, "grad_norm": 0.44410285353660583, "learning_rate": 1.2152e-06, "loss": 0.01, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 201.4921875, "completions/mean_terminated_length": 179.32968139648438, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.25299958884716034, "epoch": 0.01752, "frac_reward_zero_std": 0.875, "grad_norm": 0.22851069271564484, "learning_rate": 1.2208e-06, "loss": 0.0146, "num_tokens": 10592237.0, "reward": 0.046875, "reward_std": 0.1280868798494339, "rewards/reward_fn/mean": 0.046875, "rewards/reward_fn/std": 0.37352070212364197, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0007330186635954306, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007330186635954306, "entropy": 0.28550006449222565, "epoch": 0.0176, "grad_norm": 0.30740663409233093, "learning_rate": 1.2264e-06, "loss": 0.0009, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 198.234375, "completions/mean_terminated_length": 171.01148986816406, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.24287106096744537, "epoch": 0.01768, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2319999999999998e-06, "loss": 0.0, "num_tokens": 10683147.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24996797740459442, "epoch": 0.01776, "grad_norm": 0.0, "learning_rate": 1.2376e-06, "loss": 0.0, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 207.6171875, "completions/mean_terminated_length": 191.48959350585938, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 0.250299409031868, "epoch": 0.01784, "frac_reward_zero_std": 0.875, "grad_norm": 0.5217434763908386, "learning_rate": 1.2432e-06, "loss": -0.0153, "num_tokens": 10775258.0, "reward": 0.421875, "reward_std": 0.1280868798494339, "rewards/reward_fn/mean": 0.421875, "rewards/reward_fn/std": 1.047000765800476, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0012263250100659207, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012263250100659207, "entropy": 0.25879286974668503, "epoch": 0.01792, "grad_norm": 0.2021351158618927, "learning_rate": 1.2488e-06, "loss": 0.0209, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.09375, "completions/mean_terminated_length": 195.9058837890625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.24389813840389252, "epoch": 0.018, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2544e-06, "loss": 0.0, "num_tokens": 10868454.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22992303222417831, "epoch": 0.01808, "grad_norm": 0.0, "learning_rate": 1.26e-06, "loss": 0.0, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 212.8984375, "completions/mean_terminated_length": 187.03750610351562, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.21093931049108505, "epoch": 0.01816, "frac_reward_zero_std": 0.625, "grad_norm": 1.03831946849823, "learning_rate": 1.2656e-06, "loss": 0.0591, "num_tokens": 10961241.0, "reward": 0.375, "reward_std": 0.3155868649482727, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 1.064827799797058, "step": 227 }, { "clip_ratio/high_max": 0.005158339627087116, "clip_ratio/high_mean": 0.002288350136950612, "clip_ratio/low_mean": 0.0017979448311962187, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00408629490993917, "entropy": 0.21694151312112808, "epoch": 0.01824, "grad_norm": 0.450391560792923, "learning_rate": 1.2712e-06, "loss": -0.0381, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 192.421875, "completions/mean_terminated_length": 167.54348754882812, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 0.2342996746301651, "epoch": 0.01832, "frac_reward_zero_std": 0.75, "grad_norm": 0.6899523735046387, "learning_rate": 1.2768e-06, "loss": 0.003, "num_tokens": 11051407.0, "reward": 0.9590879082679749, "reward_std": 0.2020762413740158, "rewards/reward_fn/mean": 0.9590879082679749, "rewards/reward_fn/std": 1.3826957941055298, "step": 229 }, { "clip_ratio/high_max": 0.006827795645222068, "clip_ratio/high_mean": 0.0023809807607904077, "clip_ratio/low_mean": 0.0007303081074496731, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031112887081690133, "entropy": 0.2690436840057373, "epoch": 0.0184, "grad_norm": 0.6946871280670166, "learning_rate": 1.2824e-06, "loss": 0.0004, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 213.046875, "completions/mean_terminated_length": 188.1234588623047, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.2073386311531067, "epoch": 0.01848, "frac_reward_zero_std": 0.875, "grad_norm": 0.8255804181098938, "learning_rate": 1.288e-06, "loss": -0.015, "num_tokens": 11144213.0, "reward": 0.0007786562200635672, "reward_std": 0.0031146248802542686, "rewards/reward_fn/mean": 0.0007786562200635672, "rewards/reward_fn/std": 0.008809489198029041, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001080018628272228, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001080018628272228, "entropy": 0.23826906085014343, "epoch": 0.01856, "grad_norm": 0.10162186622619629, "learning_rate": 1.2936e-06, "loss": 0.0128, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 193.0859375, "completions/mean_terminated_length": 171.23158264160156, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.23083802312612534, "epoch": 0.01864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.2992e-06, "loss": 0.0, "num_tokens": 11234464.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.21717363595962524, "epoch": 0.01872, "grad_norm": 0.0, "learning_rate": 1.3048e-06, "loss": 0.0, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 209.6171875, "completions/mean_terminated_length": 185.32142639160156, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.20076970010995865, "epoch": 0.0188, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3104e-06, "loss": 0.0, "num_tokens": 11326831.0, "reward": 0.07061244547367096, "reward_std": 0.0, "rewards/reward_fn/mean": 0.07061244547367096, "rewards/reward_fn/std": 0.18755705654621124, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.20080925524234772, "epoch": 0.01888, "grad_norm": 0.0, "learning_rate": 1.316e-06, "loss": 0.0, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 197.609375, "completions/mean_terminated_length": 182.72549438476562, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.2350926771759987, "epoch": 0.01896, "frac_reward_zero_std": 0.75, "grad_norm": 0.6189565658569336, "learning_rate": 1.3216e-06, "loss": -0.0067, "num_tokens": 11417661.0, "reward": 1.265625, "reward_std": 0.27326756715774536, "rewards/reward_fn/mean": 1.265625, "rewards/reward_fn/std": 1.4873977899551392, "step": 237 }, { "clip_ratio/high_max": 0.000885024550370872, "clip_ratio/high_mean": 0.000221256137592718, "clip_ratio/low_mean": 0.0014993723598308861, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017206285265274346, "entropy": 0.21956052631139755, "epoch": 0.01904, "grad_norm": 0.6889275312423706, "learning_rate": 1.3272e-06, "loss": 0.0013, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 211.2890625, "completions/mean_terminated_length": 191.69662475585938, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.22114653140306473, "epoch": 0.01912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3328e-06, "loss": 0.0, "num_tokens": 11510242.0, "reward": 0.4633024334907532, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4633024334907532, "rewards/reward_fn/std": 0.9901458024978638, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.21622156351804733, "epoch": 0.0192, "grad_norm": 0.0, "learning_rate": 1.3384e-06, "loss": 0.0, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 199.578125, "completions/mean_terminated_length": 180.77084350585938, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.20501133799552917, "epoch": 0.01928, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.344e-06, "loss": 0.0, "num_tokens": 11601324.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.20278869569301605, "epoch": 0.01936, "grad_norm": 0.0, "learning_rate": 1.3496e-06, "loss": 0.0, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 211.6875, "completions/mean_terminated_length": 190.80459594726562, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.2154131829738617, "epoch": 0.01944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3552e-06, "loss": 0.0, "num_tokens": 11693956.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.21194593608379364, "epoch": 0.01952, "grad_norm": 0.0, "learning_rate": 1.3608e-06, "loss": 0.0, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 204.28125, "completions/mean_terminated_length": 181.61798095703125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.22167643904685974, "epoch": 0.0196, "frac_reward_zero_std": 0.75, "grad_norm": 0.6284977793693542, "learning_rate": 1.3664000000000002e-06, "loss": 0.0134, "num_tokens": 11785640.0, "reward": 0.9609375, "reward_std": 0.39608466625213623, "rewards/reward_fn/mean": 0.9609375, "rewards/reward_fn/std": 1.5942927598953247, "step": 245 }, { "clip_ratio/high_max": 0.008481380995362997, "clip_ratio/high_mean": 0.003342945477925241, "clip_ratio/low_mean": 0.00042344172834418714, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0037663872353732586, "entropy": 0.21468418091535568, "epoch": 0.01968, "grad_norm": 0.43391671776771545, "learning_rate": 1.372e-06, "loss": -0.0166, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 206.7578125, "completions/mean_terminated_length": 192.3333282470703, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.1854093223810196, "epoch": 0.01976, "frac_reward_zero_std": 0.75, "grad_norm": 0.929018497467041, "learning_rate": 1.3776e-06, "loss": 0.0351, "num_tokens": 11877641.0, "reward": 0.15260423719882965, "reward_std": 0.18565356731414795, "rewards/reward_fn/mean": 0.15260423719882965, "rewards/reward_fn/std": 0.5829187035560608, "step": 247 }, { "clip_ratio/high_max": 0.006596652325242758, "clip_ratio/high_mean": 0.0023334696306847036, "clip_ratio/low_mean": 0.00029733250266872346, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026308021042495966, "entropy": 0.19523750245571136, "epoch": 0.01984, "grad_norm": 0.417076051235199, "learning_rate": 1.3832e-06, "loss": -0.0368, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 192.171875, "completions/mean_terminated_length": 181.04586791992188, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.21449711173772812, "epoch": 0.01992, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3888e-06, "loss": 0.0, "num_tokens": 11967775.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.20677679032087326, "epoch": 0.02, "grad_norm": 0.0, "learning_rate": 1.3944e-06, "loss": 0.0, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1640625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 203.2265625, "completions/mean_terminated_length": 192.86915588378906, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.19357240200042725, "epoch": 0.02008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4000000000000001e-06, "loss": 0.0, "num_tokens": 12059324.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.19241608679294586, "epoch": 0.02016, "grad_norm": 0.0, "learning_rate": 1.4056e-06, "loss": 0.0, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 196.7265625, "completions/mean_terminated_length": 180.8811798095703, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.20543163269758224, "epoch": 0.02024, "frac_reward_zero_std": 0.875, "grad_norm": 0.12329332530498505, "learning_rate": 1.4112e-06, "loss": -0.0131, "num_tokens": 12150041.0, "reward": 0.3515625, "reward_std": 0.09375, "rewards/reward_fn/mean": 0.3515625, "rewards/reward_fn/std": 0.9687222242355347, "step": 253 }, { "clip_ratio/high_max": 0.0021630129776895046, "clip_ratio/high_mean": 0.0005407532444223762, "clip_ratio/low_mean": 7.427213131450117e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006150253757368773, "entropy": 0.1874091476202011, "epoch": 0.02032, "grad_norm": 0.4647635519504547, "learning_rate": 1.4168e-06, "loss": 0.0203, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 196.75, "completions/mean_terminated_length": 186.42201232910156, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.19443897902965546, "epoch": 0.0204, "frac_reward_zero_std": 0.75, "grad_norm": 1.0549484491348267, "learning_rate": 1.4224e-06, "loss": 0.0206, "num_tokens": 12240761.0, "reward": 0.4679545760154724, "reward_std": 0.22577203810214996, "rewards/reward_fn/mean": 0.4679545760154724, "rewards/reward_fn/std": 1.0344632863998413, "step": 255 }, { "clip_ratio/high_max": 0.009061843622475863, "clip_ratio/high_mean": 0.0022654609056189656, "clip_ratio/low_mean": 0.0005790367722511292, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028444976778700948, "entropy": 0.18636620789766312, "epoch": 0.02048, "grad_norm": 0.2497655749320984, "learning_rate": 1.428e-06, "loss": 0.0009, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 203.6015625, "completions/mean_terminated_length": 185.40000915527344, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.19089756906032562, "epoch": 0.02056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4336000000000001e-06, "loss": 0.0, "num_tokens": 12332358.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.20137739181518555, "epoch": 0.02064, "grad_norm": 0.0, "learning_rate": 1.4392e-06, "loss": 0.0, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 215.7734375, "completions/mean_terminated_length": 205.01980590820312, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.17926256358623505, "epoch": 0.02072, "frac_reward_zero_std": 0.875, "grad_norm": 0.3687487840652466, "learning_rate": 1.4448e-06, "loss": 0.0092, "num_tokens": 12425513.0, "reward": 0.328125, "reward_std": 0.1280868798494339, "rewards/reward_fn/mean": 0.328125, "rewards/reward_fn/std": 0.9400064945220947, "step": 259 }, { "clip_ratio/high_max": 0.00745870778337121, "clip_ratio/high_mean": 0.0018646769458428025, "clip_ratio/low_mean": 0.0002673796843737364, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002132056630216539, "entropy": 0.18373054265975952, "epoch": 0.0208, "grad_norm": 0.43386057019233704, "learning_rate": 1.4504e-06, "loss": -0.0052, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 200.7578125, "completions/mean_terminated_length": 185.2899932861328, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.1907161921262741, "epoch": 0.02088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.456e-06, "loss": 0.0, "num_tokens": 12516746.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.21125224232673645, "epoch": 0.02096, "grad_norm": 0.0, "learning_rate": 1.4616000000000001e-06, "loss": 0.0, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 171.4453125, "completions/mean_terminated_length": 155.7870330810547, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.21418803930282593, "epoch": 0.02104, "frac_reward_zero_std": 0.875, "grad_norm": 0.5002255439758301, "learning_rate": 1.4672000000000001e-06, "loss": -0.0078, "num_tokens": 12604227.0, "reward": 0.3515625, "reward_std": 0.09375, "rewards/reward_fn/mean": 0.3515625, "rewards/reward_fn/std": 1.0393050909042358, "step": 263 }, { "clip_ratio/high_max": 0.002755390596576035, "clip_ratio/high_mean": 0.0006888476491440088, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006888476491440088, "entropy": 0.23101773113012314, "epoch": 0.02112, "grad_norm": 0.13350734114646912, "learning_rate": 1.4728e-06, "loss": -0.0157, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 144.2890625, "completions/mean_terminated_length": 136.8416748046875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.24340461939573288, "epoch": 0.0212, "frac_reward_zero_std": 0.75, "grad_norm": 1.222565770149231, "learning_rate": 1.4784e-06, "loss": 0.0061, "num_tokens": 12688232.0, "reward": 1.359375, "reward_std": 0.27326756715774536, "rewards/reward_fn/mean": 1.359375, "rewards/reward_fn/std": 1.4992616176605225, "step": 265 }, { "clip_ratio/high_max": 0.007126410258933902, "clip_ratio/high_mean": 0.00202274601906538, "clip_ratio/low_mean": 0.0010456624440848827, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003068408463150263, "entropy": 0.24800540506839752, "epoch": 0.02128, "grad_norm": 0.8375065922737122, "learning_rate": 1.484e-06, "loss": 0.0114, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 166.0859375, "completions/mean_terminated_length": 149.4351806640625, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.20299920439720154, "epoch": 0.02136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4895999999999999e-06, "loss": 0.0, "num_tokens": 12775027.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22778868675231934, "epoch": 0.02144, "grad_norm": 0.0, "learning_rate": 1.4952e-06, "loss": 0.0, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 138.515625, "completions/mean_terminated_length": 129.63026428222656, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.2100439965724945, "epoch": 0.02152, "frac_reward_zero_std": 0.875, "grad_norm": 0.40798038244247437, "learning_rate": 1.5008e-06, "loss": -0.0017, "num_tokens": 12858293.0, "reward": 0.1399293839931488, "reward_std": 0.16770508885383606, "rewards/reward_fn/mean": 0.1399293839931488, "rewards/reward_fn/std": 0.525596022605896, "step": 269 }, { "clip_ratio/high_max": 0.001054852269589901, "clip_ratio/high_mean": 0.00026371306739747524, "clip_ratio/low_mean": 0.0008598729327786714, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011235860583838075, "entropy": 0.209209106862545, "epoch": 0.0216, "grad_norm": 0.47167539596557617, "learning_rate": 1.5064e-06, "loss": -0.0051, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 186.4609375, "completions/mean_terminated_length": 176.52679443359375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.1940905824303627, "epoch": 0.02168, "frac_reward_zero_std": 0.875, "grad_norm": 0.42383500933647156, "learning_rate": 1.5119999999999999e-06, "loss": -0.0138, "num_tokens": 12947696.0, "reward": 0.4146711230278015, "reward_std": 0.015486023388803005, "rewards/reward_fn/mean": 0.4146711230278015, "rewards/reward_fn/std": 0.9874484539031982, "step": 271 }, { "clip_ratio/high_max": 0.003764873370528221, "clip_ratio/high_mean": 0.0009412183426320553, "clip_ratio/low_mean": 7.725587056484073e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010184741986449808, "entropy": 0.22626465559005737, "epoch": 0.02176, "grad_norm": 0.3829929828643799, "learning_rate": 1.5175999999999999e-06, "loss": -0.0041, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 204.96875, "completions/mean_terminated_length": 196.07337951660156, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.18575860559940338, "epoch": 0.02184, "frac_reward_zero_std": 0.875, "grad_norm": 0.16161485016345978, "learning_rate": 1.5231999999999999e-06, "loss": 0.0144, "num_tokens": 13039468.0, "reward": 0.7734375, "reward_std": 0.09375, "rewards/reward_fn/mean": 0.7734375, "rewards/reward_fn/std": 1.3174470663070679, "step": 273 }, { "clip_ratio/high_max": 0.0014148274203762412, "clip_ratio/high_mean": 0.0003537068550940603, "clip_ratio/low_mean": 0.0014780304045416415, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018317372305318713, "entropy": 0.17982332408428192, "epoch": 0.02192, "grad_norm": 0.4799412786960602, "learning_rate": 1.5288e-06, "loss": -0.0203, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1640625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 203.6328125, "completions/mean_terminated_length": 193.35513305664062, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.1774713397026062, "epoch": 0.022, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5344e-06, "loss": 0.0, "num_tokens": 13131069.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.17058352380990982, "epoch": 0.02208, "grad_norm": 0.0, "learning_rate": 1.54e-06, "loss": 0.0, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 213.875, "completions/mean_terminated_length": 202.0800018310547, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.1736796423792839, "epoch": 0.02216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5455999999999999e-06, "loss": 0.0, "num_tokens": 13223981.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1748671531677246, "epoch": 0.02224, "grad_norm": 0.0, "learning_rate": 1.5511999999999999e-06, "loss": 0.0, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 205.8203125, "completions/mean_terminated_length": 191.1212158203125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.18593861162662506, "epoch": 0.02232, "frac_reward_zero_std": 0.875, "grad_norm": 0.6945249438285828, "learning_rate": 1.5567999999999999e-06, "loss": 0.0195, "num_tokens": 13315862.0, "reward": 0.11920405924320221, "reward_std": 0.020657330751419067, "rewards/reward_fn/mean": 0.11920405924320221, "rewards/reward_fn/std": 0.24625369906425476, "step": 279 }, { "clip_ratio/high_max": 0.0036061499267816544, "clip_ratio/high_mean": 0.0009015374816954136, "clip_ratio/low_mean": 0.00033103814348578453, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012325755960773677, "entropy": 0.1890474483370781, "epoch": 0.0224, "grad_norm": 0.4590570032596588, "learning_rate": 1.5624e-06, "loss": -0.0247, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 194.5859375, "completions/mean_terminated_length": 181.1333465576172, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.1852336972951889, "epoch": 0.02248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.568e-06, "loss": 0.0, "num_tokens": 13406305.0, "reward": 0.384978711605072, "reward_std": 0.0, "rewards/reward_fn/mean": 0.384978711605072, "rewards/reward_fn/std": 0.9926154613494873, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.18563683331012726, "epoch": 0.02256, "grad_norm": 0.0, "learning_rate": 1.5736e-06, "loss": 0.0, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 198.9765625, "completions/mean_terminated_length": 181.52040100097656, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.1995493471622467, "epoch": 0.02264, "frac_reward_zero_std": 0.75, "grad_norm": 0.6608814597129822, "learning_rate": 1.5791999999999998e-06, "loss": -0.0367, "num_tokens": 13497310.0, "reward": 0.25766098499298096, "reward_std": 0.24794167280197144, "rewards/reward_fn/mean": 0.25766098499298096, "rewards/reward_fn/std": 0.7234182953834534, "step": 283 }, { "clip_ratio/high_max": 0.007884211954660714, "clip_ratio/high_mean": 0.0019710529886651784, "clip_ratio/low_mean": 0.002378432545810938, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004349485505372286, "entropy": 0.21283648163080215, "epoch": 0.02272, "grad_norm": 0.45125535130500793, "learning_rate": 1.5847999999999998e-06, "loss": 0.026, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1640625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 202.5, "completions/mean_terminated_length": 192.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 0.1775733157992363, "epoch": 0.0228, "frac_reward_zero_std": 0.875, "grad_norm": 0.39479178190231323, "learning_rate": 1.5904e-06, "loss": 0.0076, "num_tokens": 13588766.0, "reward": 0.44500401616096497, "reward_std": 0.06376247107982635, "rewards/reward_fn/mean": 0.4450039863586426, "rewards/reward_fn/std": 1.0022555589675903, "step": 285 }, { "clip_ratio/high_max": 0.0019309238996356726, "clip_ratio/high_mean": 0.00048273097490891814, "clip_ratio/low_mean": 0.0009654619498178363, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014481929247267544, "entropy": 0.17802564799785614, "epoch": 0.02288, "grad_norm": 0.42137759923934937, "learning_rate": 1.596e-06, "loss": 0.0036, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 208.125, "completions/mean_terminated_length": 199.25926208496094, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.1837410107254982, "epoch": 0.02296, "frac_reward_zero_std": 0.875, "grad_norm": 0.6215353012084961, "learning_rate": 1.6016e-06, "loss": 0.028, "num_tokens": 13680942.0, "reward": 0.37937265634536743, "reward_std": 0.0017069148598238826, "rewards/reward_fn/mean": 0.37937265634536743, "rewards/reward_fn/std": 0.9944735169410706, "step": 287 }, { "clip_ratio/high_max": 0.0063039439264684916, "clip_ratio/high_mean": 0.0015759859816171229, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015759859816171229, "entropy": 0.18238450586795807, "epoch": 0.02304, "grad_norm": 0.13702423870563507, "learning_rate": 1.6072e-06, "loss": -0.0211, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 190.5546875, "completions/mean_terminated_length": 179.1467742919922, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.19844719022512436, "epoch": 0.02312, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6127999999999998e-06, "loss": 0.0, "num_tokens": 13770869.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.20333098620176315, "epoch": 0.0232, "grad_norm": 0.0, "learning_rate": 1.6183999999999998e-06, "loss": 0.0, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 166.5390625, "completions/mean_terminated_length": 150.9449462890625, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.20631413906812668, "epoch": 0.02328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.624e-06, "loss": 0.0, "num_tokens": 13857722.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.20145953446626663, "epoch": 0.02336, "grad_norm": 0.0, "learning_rate": 1.6296e-06, "loss": 0.0, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1328125, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 180.5703125, "completions/mean_terminated_length": 169.0180206298828, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.18313108384609222, "epoch": 0.02344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6352e-06, "loss": 0.0, "num_tokens": 13946371.0, "reward": 0.37749966979026794, "reward_std": 0.0, "rewards/reward_fn/mean": 0.37749966979026794, "rewards/reward_fn/std": 0.9951284527778625, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1855594888329506, "epoch": 0.02352, "grad_norm": 0.0, "learning_rate": 1.6408e-06, "loss": 0.0, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 182.3515625, "completions/mean_terminated_length": 167.0660400390625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.18440038710832596, "epoch": 0.0236, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6464e-06, "loss": 0.0, "num_tokens": 14035248.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.18895912170410156, "epoch": 0.02368, "grad_norm": 0.0, "learning_rate": 1.6519999999999998e-06, "loss": 0.0, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1171875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 177.5234375, "completions/mean_terminated_length": 167.106201171875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.17787007242441177, "epoch": 0.02376, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6576e-06, "loss": 0.0, "num_tokens": 14123507.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.17699190229177475, "epoch": 0.02384, "grad_norm": 0.0, "learning_rate": 1.6632e-06, "loss": 0.0, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 172.5, "completions/mean_terminated_length": 157.0370330810547, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.19304759800434113, "epoch": 0.02392, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6688e-06, "loss": 0.0, "num_tokens": 14211123.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.17903666198253632, "epoch": 0.024, "grad_norm": 0.0, "learning_rate": 1.6744e-06, "loss": 0.0, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 170.9375, "completions/mean_terminated_length": 156.11009216308594, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 0.18391264230012894, "epoch": 0.02408, "frac_reward_zero_std": 0.875, "grad_norm": 0.34253427386283875, "learning_rate": 1.68e-06, "loss": 0.0091, "num_tokens": 14298539.0, "reward": 0.4921875, "reward_std": 0.17951758205890656, "rewards/reward_fn/mean": 0.4921875, "rewards/reward_fn/std": 1.115362286567688, "step": 301 }, { "clip_ratio/high_max": 0.003585657337680459, "clip_ratio/high_mean": 0.0008964143344201148, "clip_ratio/low_mean": 0.0008116252720355988, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017080396064557135, "entropy": 0.18513204902410507, "epoch": 0.02416, "grad_norm": 0.663201630115509, "learning_rate": 1.6855999999999998e-06, "loss": -0.0257, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 175.28125, "completions/mean_terminated_length": 162.07272338867188, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.18765071779489517, "epoch": 0.02424, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6912e-06, "loss": 0.0, "num_tokens": 14386511.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.20041364431381226, "epoch": 0.02432, "grad_norm": 0.0, "learning_rate": 1.6968e-06, "loss": 0.0, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 155.8984375, "completions/mean_terminated_length": 146.4871826171875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.19057995080947876, "epoch": 0.0244, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7024e-06, "loss": 0.0, "num_tokens": 14472002.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.17885561287403107, "epoch": 0.02448, "grad_norm": 0.0, "learning_rate": 1.708e-06, "loss": 0.0, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 161.15625, "completions/mean_terminated_length": 158.09677124023438, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.1976938396692276, "epoch": 0.02456, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7136e-06, "loss": 0.0, "num_tokens": 14558166.0, "reward": 0.11952967941761017, "reward_std": 0.0, "rewards/reward_fn/mean": 0.11952967941761017, "rewards/reward_fn/std": 0.317488431930542, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1956879049539566, "epoch": 0.02464, "grad_norm": 0.0, "learning_rate": 1.7192e-06, "loss": 0.0, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 175.0390625, "completions/mean_terminated_length": 158.23585510253906, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 0.19270358979701996, "epoch": 0.02472, "frac_reward_zero_std": 0.75, "grad_norm": 1.1206114292144775, "learning_rate": 1.7248e-06, "loss": -0.0396, "num_tokens": 14646107.0, "reward": 0.0707806870341301, "reward_std": 0.1530400812625885, "rewards/reward_fn/mean": 0.0707806870341301, "rewards/reward_fn/std": 0.4556065499782562, "step": 309 }, { "clip_ratio/high_max": 0.0011363636003807187, "clip_ratio/high_mean": 0.0002840909000951797, "clip_ratio/low_mean": 0.0013764658942818642, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016605568234808743, "entropy": 0.1778305545449257, "epoch": 0.0248, "grad_norm": 0.6063694953918457, "learning_rate": 1.7304e-06, "loss": -0.0182, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 172.359375, "completions/mean_terminated_length": 152.05825805664062, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.17683938145637512, "epoch": 0.02488, "frac_reward_zero_std": 0.875, "grad_norm": 0.16685272753238678, "learning_rate": 1.736e-06, "loss": -0.0203, "num_tokens": 14733705.0, "reward": 0.70509934425354, "reward_std": 0.00595592288300395, "rewards/reward_fn/mean": 0.70509934425354, "rewards/reward_fn/std": 1.0902820825576782, "step": 311 }, { "clip_ratio/high_max": 0.0035390615521464497, "clip_ratio/high_mean": 0.0008847653880366124, "clip_ratio/low_mean": 0.00019893898570444435, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010837043810170144, "entropy": 0.16889777779579163, "epoch": 0.02496, "grad_norm": 0.5199302434921265, "learning_rate": 1.7416e-06, "loss": 0.0233, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 164.421875, "completions/mean_terminated_length": 143.28846740722656, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 0.203529492020607, "epoch": 0.02504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7472e-06, "loss": 0.0, "num_tokens": 14820287.0, "reward": 0.055780451744794846, "reward_std": 0.0, "rewards/reward_fn/mean": 0.055780451744794846, "rewards/reward_fn/std": 0.14816109836101532, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.18073301017284393, "epoch": 0.02512, "grad_norm": 0.0, "learning_rate": 1.7528e-06, "loss": 0.0, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 186.4375, "completions/mean_terminated_length": 164.2061767578125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.18012353032827377, "epoch": 0.0252, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7583999999999998e-06, "loss": 0.0, "num_tokens": 14909687.0, "reward": 0.11815997213125229, "reward_std": 0.0, "rewards/reward_fn/mean": 0.11815997213125229, "rewards/reward_fn/std": 0.2917759418487549, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.18340276181697845, "epoch": 0.02528, "grad_norm": 0.0, "learning_rate": 1.764e-06, "loss": 0.0, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 191.359375, "completions/mean_terminated_length": 173.25999450683594, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.17996052652597427, "epoch": 0.02536, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7696000000000002e-06, "loss": 0.0, "num_tokens": 14999717.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.17967627197504044, "epoch": 0.02544, "grad_norm": 0.0, "learning_rate": 1.7752e-06, "loss": 0.0, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 192.3515625, "completions/mean_terminated_length": 162.3563232421875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.15872837603092194, "epoch": 0.02552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7808000000000002e-06, "loss": 0.0, "num_tokens": 15089874.0, "reward": 0.055780451744794846, "reward_std": 0.0, "rewards/reward_fn/mean": 0.055780451744794846, "rewards/reward_fn/std": 0.14816109836101532, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.15924812108278275, "epoch": 0.0256, "grad_norm": 0.0, "learning_rate": 1.7864e-06, "loss": 0.0, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 197.90625, "completions/mean_terminated_length": 171.5, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.1654708981513977, "epoch": 0.02568, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.792e-06, "loss": 0.0, "num_tokens": 15180742.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.16694258153438568, "epoch": 0.02576, "grad_norm": 0.0, "learning_rate": 1.7975999999999997e-06, "loss": 0.0, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 177.359375, "completions/mean_terminated_length": 151.14584350585938, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 0.1720457375049591, "epoch": 0.02584, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8032e-06, "loss": 0.0, "num_tokens": 15268980.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1696223020553589, "epoch": 0.02592, "grad_norm": 0.0, "learning_rate": 1.8088000000000002e-06, "loss": 0.0, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 191.0859375, "completions/mean_terminated_length": 166.65591430664062, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.16223601251840591, "epoch": 0.026, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8144e-06, "loss": 0.0, "num_tokens": 15358975.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.16985199600458145, "epoch": 0.02608, "grad_norm": 0.0, "learning_rate": 1.82e-06, "loss": 0.0, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 206.7734375, "completions/mean_terminated_length": 184.39773559570312, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.17064998298883438, "epoch": 0.02616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8255999999999997e-06, "loss": 0.0, "num_tokens": 15450978.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.16440509259700775, "epoch": 0.02624, "grad_norm": 0.0, "learning_rate": 1.8312e-06, "loss": 0.0, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 219.6015625, "completions/mean_terminated_length": 193.8800048828125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.1422550305724144, "epoch": 0.02632, "frac_reward_zero_std": 0.875, "grad_norm": 0.3767935037612915, "learning_rate": 1.8368000000000001e-06, "loss": -0.0127, "num_tokens": 15544623.0, "reward": 0.703125, "reward_std": 0.1280868798494339, "rewards/reward_fn/mean": 0.703125, "rewards/reward_fn/std": 1.2758160829544067, "step": 329 }, { "clip_ratio/high_max": 0.003677938599139452, "clip_ratio/high_mean": 0.000919484649784863, "clip_ratio/low_mean": 0.00015225335664581507, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010717380209825933, "entropy": 0.1506471112370491, "epoch": 0.0264, "grad_norm": 0.42855095863342285, "learning_rate": 1.8424e-06, "loss": 0.0053, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 205.8671875, "completions/mean_terminated_length": 192.46534729003906, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.16821109503507614, "epoch": 0.02648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8480000000000001e-06, "loss": 0.0, "num_tokens": 15636510.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.16880616545677185, "epoch": 0.02656, "grad_norm": 0.0, "learning_rate": 1.8536e-06, "loss": 0.0, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 197.4296875, "completions/mean_terminated_length": 175.38710021972656, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.1544022485613823, "epoch": 0.02664, "frac_reward_zero_std": 0.875, "grad_norm": 0.14662699401378632, "learning_rate": 1.8592e-06, "loss": -0.0169, "num_tokens": 15727317.0, "reward": 0.4093271493911743, "reward_std": 0.09375, "rewards/reward_fn/mean": 0.4093271493911743, "rewards/reward_fn/std": 0.9597022533416748, "step": 333 }, { "clip_ratio/high_max": 0.003205543733201921, "clip_ratio/high_mean": 0.0008013859333004802, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008013859333004802, "entropy": 0.15783175826072693, "epoch": 0.02672, "grad_norm": 0.477056622505188, "learning_rate": 1.8648000000000001e-06, "loss": -0.0022, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 200.6171875, "completions/mean_terminated_length": 178.09890747070312, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.1610943302512169, "epoch": 0.0268, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8704e-06, "loss": 0.0, "num_tokens": 15818532.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.16404956579208374, "epoch": 0.02688, "grad_norm": 0.0, "learning_rate": 1.8760000000000001e-06, "loss": 0.0, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.8671875, "completions/mean_terminated_length": 199.25973510742188, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.15018602460622787, "epoch": 0.02696, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8816e-06, "loss": 0.0, "num_tokens": 15912467.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.14568492770195007, "epoch": 0.02704, "grad_norm": 0.0, "learning_rate": 1.8872000000000001e-06, "loss": 0.0, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 224.515625, "completions/mean_terminated_length": 210.20455932617188, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.1500326469540596, "epoch": 0.02712, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.8927999999999997e-06, "loss": 0.0, "num_tokens": 16006741.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1539461612701416, "epoch": 0.0272, "grad_norm": 0.0, "learning_rate": 1.8984e-06, "loss": 0.0, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.0390625, "completions/mean_terminated_length": 218.73611450195312, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.13350392878055573, "epoch": 0.02728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9040000000000001e-06, "loss": 0.0, "num_tokens": 16102362.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1415727138519287, "epoch": 0.02736, "grad_norm": 0.0, "learning_rate": 1.9096e-06, "loss": 0.0, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.0703125, "completions/mean_terminated_length": 213.96340942382812, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.14176516979932785, "epoch": 0.02744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9152e-06, "loss": 0.0, "num_tokens": 16197219.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.13524317741394043, "epoch": 0.02752, "grad_norm": 0.0, "learning_rate": 1.9208e-06, "loss": 0.0, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.0390625, "completions/mean_terminated_length": 221.01190185546875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.15156207978725433, "epoch": 0.0276, "frac_reward_zero_std": 0.875, "grad_norm": 0.37506213784217834, "learning_rate": 1.9264e-06, "loss": 0.0045, "num_tokens": 16292584.0, "reward": 0.3046875, "reward_std": 0.15116733312606812, "rewards/reward_fn/mean": 0.3046875, "rewards/reward_fn/std": 0.9097771048545837, "step": 345 }, { "clip_ratio/high_max": 0.0048877166118472815, "clip_ratio/high_mean": 0.0012219291529618204, "clip_ratio/low_mean": 0.000155472633196041, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013774017861578614, "entropy": 0.14259923249483109, "epoch": 0.02768, "grad_norm": 0.3965364098548889, "learning_rate": 1.9320000000000003e-06, "loss": -0.0243, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.125, "completions/mean_terminated_length": 216.7058868408203, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.15057149529457092, "epoch": 0.02776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9376e-06, "loss": 0.0, "num_tokens": 16388216.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1490882933139801, "epoch": 0.02784, "grad_norm": 0.0, "learning_rate": 1.9432e-06, "loss": 0.0, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 239.453125, "completions/mean_terminated_length": 219.48275756835938, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 0.145612433552742, "epoch": 0.02792, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9488e-06, "loss": 0.0, "num_tokens": 16484402.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.14159974455833435, "epoch": 0.028, "grad_norm": 0.0, "learning_rate": 1.9544e-06, "loss": 0.0, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 219.125, "completions/mean_terminated_length": 194.7012939453125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.142218716442585, "epoch": 0.02808, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9600000000000003e-06, "loss": 0.0, "num_tokens": 16577986.0, "reward": 0.468217134475708, "reward_std": 0.0, "rewards/reward_fn/mean": 0.468217134475708, "rewards/reward_fn/std": 0.9862108826637268, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.14754539728164673, "epoch": 0.02816, "grad_norm": 0.0, "learning_rate": 1.9656e-06, "loss": 0.0, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 234.625, "completions/mean_terminated_length": 213.90769958496094, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.14606352895498276, "epoch": 0.02824, "frac_reward_zero_std": 0.875, "grad_norm": 0.4806402027606964, "learning_rate": 1.9712000000000003e-06, "loss": -0.0034, "num_tokens": 16673554.0, "reward": 0.2751990556716919, "reward_std": 0.17951758205890656, "rewards/reward_fn/mean": 0.2751990556716919, "rewards/reward_fn/std": 0.8400203585624695, "step": 353 }, { "clip_ratio/high_max": 0.006923143984749913, "clip_ratio/high_mean": 0.0017307859961874783, "clip_ratio/low_mean": 0.000722617405699566, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024534034309908748, "entropy": 0.15055270493030548, "epoch": 0.02832, "grad_norm": 0.5553947687149048, "learning_rate": 1.9768e-06, "loss": -0.0025, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.7421875, "completions/mean_terminated_length": 213.5192413330078, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.13307589292526245, "epoch": 0.0284, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.9824e-06, "loss": 0.0, "num_tokens": 16769649.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.14007706195116043, "epoch": 0.02848, "grad_norm": 0.0, "learning_rate": 1.988e-06, "loss": 0.0, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 230.046875, "completions/mean_terminated_length": 207.14706420898438, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.14946308732032776, "epoch": 0.02856, "frac_reward_zero_std": 0.875, "grad_norm": 0.6103929281234741, "learning_rate": 1.9936e-06, "loss": -0.014, "num_tokens": 16864631.0, "reward": 0.046875, "reward_std": 0.1280868798494339, "rewards/reward_fn/mean": 0.046875, "rewards/reward_fn/std": 0.37352070212364197, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0011671485262922943, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011671485262922943, "entropy": 0.14797227084636688, "epoch": 0.02864, "grad_norm": 0.14220835268497467, "learning_rate": 1.9992000000000003e-06, "loss": 0.0223, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 213.9140625, "completions/mean_terminated_length": 189.49383544921875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.14692170917987823, "epoch": 0.02872, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0048e-06, "loss": 0.0, "num_tokens": 16957548.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.14754343777894974, "epoch": 0.0288, "grad_norm": 0.0, "learning_rate": 2.0104e-06, "loss": 0.0, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.5625, "completions/mean_terminated_length": 216.8000030517578, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.14531764388084412, "epoch": 0.02888, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.016e-06, "loss": 0.0, "num_tokens": 17053108.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1418425664305687, "epoch": 0.02896, "grad_norm": 0.0, "learning_rate": 2.0216e-06, "loss": 0.0, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 211.890625, "completions/mean_terminated_length": 190.3488311767578, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.15255913883447647, "epoch": 0.02904, "frac_reward_zero_std": 0.875, "grad_norm": 0.1185266524553299, "learning_rate": 2.0272000000000003e-06, "loss": -0.0183, "num_tokens": 17145766.0, "reward": 0.7265625, "reward_std": 0.09375, "rewards/reward_fn/mean": 0.7265625, "rewards/reward_fn/std": 1.3440732955932617, "step": 363 }, { "clip_ratio/high_max": 0.003906658734194934, "clip_ratio/high_mean": 0.0009766646835487336, "clip_ratio/low_mean": 0.00019369834626559168, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00117036301526241, "entropy": 0.15786854922771454, "epoch": 0.02912, "grad_norm": 0.7877041101455688, "learning_rate": 2.0328e-06, "loss": 0.0195, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 227.0859375, "completions/mean_terminated_length": 207.93505859375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.1589801385998726, "epoch": 0.0292, "frac_reward_zero_std": 0.875, "grad_norm": 0.5988402962684631, "learning_rate": 2.0384000000000003e-06, "loss": 0.0106, "num_tokens": 17240369.0, "reward": -0.01562662050127983, "reward_std": 0.18749870359897614, "rewards/reward_fn/mean": -0.01562662050127983, "rewards/reward_fn/std": 0.5171722769737244, "step": 365 }, { "clip_ratio/high_max": 0.007468935858923942, "clip_ratio/high_mean": 0.0018672339647309855, "clip_ratio/low_mean": 0.00037515006260946393, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002242384012788534, "entropy": 0.15386903285980225, "epoch": 0.02928, "grad_norm": 0.3450712561607361, "learning_rate": 2.044e-06, "loss": -0.0037, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 222.3671875, "completions/mean_terminated_length": 203.5, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.14923560619354248, "epoch": 0.02936, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0496e-06, "loss": 0.0, "num_tokens": 17334368.0, "reward": 0.4347124993801117, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4347124993801117, "rewards/reward_fn/std": 0.9859739542007446, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.13845194876194, "epoch": 0.02944, "grad_norm": 0.0, "learning_rate": 2.0552000000000002e-06, "loss": 0.0, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 224.4140625, "completions/mean_terminated_length": 202.80262756347656, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 0.14006426185369492, "epoch": 0.02952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0608e-06, "loss": 0.0, "num_tokens": 17428629.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1400972306728363, "epoch": 0.0296, "grad_norm": 0.0, "learning_rate": 2.0664000000000002e-06, "loss": 0.0, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 211.9609375, "completions/mean_terminated_length": 196.0319061279297, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.15661174058914185, "epoch": 0.02968, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.072e-06, "loss": 0.0, "num_tokens": 17521296.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.15702302008867264, "epoch": 0.02976, "grad_norm": 0.0, "learning_rate": 2.0776000000000002e-06, "loss": 0.0, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 220.0546875, "completions/mean_terminated_length": 206.52688598632812, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 0.1423799842596054, "epoch": 0.02984, "frac_reward_zero_std": 0.875, "grad_norm": 0.1490129828453064, "learning_rate": 2.0832e-06, "loss": 0.0153, "num_tokens": 17614999.0, "reward": 0.12606748938560486, "reward_std": 0.09375, "rewards/reward_fn/mean": 0.12606748938560486, "rewards/reward_fn/std": 0.37386465072631836, "step": 373 }, { "clip_ratio/high_max": 0.0002923976571764797, "clip_ratio/high_mean": 7.309941429411992e-05, "clip_ratio/low_mean": 0.0011123365839011967, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011854359763674438, "entropy": 0.1440172716975212, "epoch": 0.02992, "grad_norm": 0.14059266448020935, "learning_rate": 2.0888e-06, "loss": 0.0139, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.5078125, "completions/mean_terminated_length": 201.64515686035156, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.1483236849308014, "epoch": 0.03, "frac_reward_zero_std": 0.875, "grad_norm": 0.8398140668869019, "learning_rate": 2.0944000000000002e-06, "loss": -0.0134, "num_tokens": 17708248.0, "reward": 0.46875, "reward_std": 0.16770508885383606, "rewards/reward_fn/mean": 0.46875, "rewards/reward_fn/std": 1.0935566425323486, "step": 375 }, { "clip_ratio/high_max": 0.0011567380279302597, "clip_ratio/high_mean": 0.0002891845069825649, "clip_ratio/low_mean": 0.0023776424350216985, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026668269420042634, "entropy": 0.15574173629283905, "epoch": 0.03008, "grad_norm": 0.39410996437072754, "learning_rate": 2.1e-06, "loss": 0.0184, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 223.265625, "completions/mean_terminated_length": 213.24488830566406, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.15130683034658432, "epoch": 0.03016, "frac_reward_zero_std": 0.875, "grad_norm": 0.7754151821136475, "learning_rate": 2.1056000000000002e-06, "loss": 0.0191, "num_tokens": 17802362.0, "reward": -0.0234375, "reward_std": 0.09375, "rewards/reward_fn/mean": -0.0234375, "rewards/reward_fn/std": 0.2651650309562683, "step": 377 }, { "clip_ratio/high_max": 0.00475650280714035, "clip_ratio/high_mean": 0.0011891257017850876, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011891257017850876, "entropy": 0.1448696255683899, "epoch": 0.03024, "grad_norm": 0.10644811391830444, "learning_rate": 2.1112e-06, "loss": -0.0125, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 202.7265625, "completions/mean_terminated_length": 184.22105407714844, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.1383971869945526, "epoch": 0.03032, "frac_reward_zero_std": 0.625, "grad_norm": 1.1743847131729126, "learning_rate": 2.1168e-06, "loss": -0.0168, "num_tokens": 17893847.0, "reward": 0.6516562104225159, "reward_std": 0.2454221248626709, "rewards/reward_fn/mean": 0.6516562104225159, "rewards/reward_fn/std": 1.2238435745239258, "step": 379 }, { "clip_ratio/high_max": 0.00806507864035666, "clip_ratio/high_mean": 0.0034233261831104755, "clip_ratio/low_mean": 0.00266525789629668, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006088584195822477, "entropy": 0.1409609466791153, "epoch": 0.0304, "grad_norm": 0.9362122416496277, "learning_rate": 2.1224e-06, "loss": 0.0246, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 197.40625, "completions/mean_terminated_length": 191.89744567871094, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.16097066551446915, "epoch": 0.03048, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1279999999999998e-06, "loss": 0.0, "num_tokens": 17984651.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1469614952802658, "epoch": 0.03056, "grad_norm": 0.0, "learning_rate": 2.1336e-06, "loss": 0.0, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 180.234375, "completions/mean_terminated_length": 158.0404052734375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.15548190474510193, "epoch": 0.03064, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1391999999999998e-06, "loss": 0.0, "num_tokens": 18073257.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.16835768520832062, "epoch": 0.03072, "grad_norm": 0.0, "learning_rate": 2.1448e-06, "loss": 0.0, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1171875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 162.0859375, "completions/mean_terminated_length": 149.6194610595703, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.1699100285768509, "epoch": 0.0308, "frac_reward_zero_std": 0.625, "grad_norm": 1.3187729120254517, "learning_rate": 2.1503999999999998e-06, "loss": 0.0086, "num_tokens": 18159540.0, "reward": 0.1171875, "reward_std": 0.44895508885383606, "rewards/reward_fn/mean": 0.1171875, "rewards/reward_fn/std": 1.4775010347366333, "step": 385 }, { "clip_ratio/high_max": 0.014244725927710533, "clip_ratio/high_mean": 0.003739498322829604, "clip_ratio/low_mean": 0.006140784360468388, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009880282450467348, "entropy": 0.16592178493738174, "epoch": 0.03088, "grad_norm": 0.5906989574432373, "learning_rate": 2.1559999999999998e-06, "loss": -0.0029, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 157.5625, "completions/mean_terminated_length": 148.3076934814453, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.16492831707000732, "epoch": 0.03096, "frac_reward_zero_std": 0.875, "grad_norm": 0.6211070418357849, "learning_rate": 2.1616e-06, "loss": -0.0042, "num_tokens": 18245244.0, "reward": 0.3558419346809387, "reward_std": 0.13916553556919098, "rewards/reward_fn/mean": 0.3558419346809387, "rewards/reward_fn/std": 1.0749855041503906, "step": 387 }, { "clip_ratio/high_max": 0.009031684137880802, "clip_ratio/high_mean": 0.0022579210344702005, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022579210344702005, "entropy": 0.1649087443947792, "epoch": 0.03104, "grad_norm": 0.10049033164978027, "learning_rate": 2.1671999999999998e-06, "loss": -0.0115, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 136.78125, "completions/mean_terminated_length": 135.84251403808594, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.17660729587078094, "epoch": 0.03112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1728e-06, "loss": 0.0, "num_tokens": 18328288.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.164266437292099, "epoch": 0.0312, "grad_norm": 0.0, "learning_rate": 2.1783999999999998e-06, "loss": 0.0, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 126.375, "completions/mean_terminated_length": 122.19354248046875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.16850634664297104, "epoch": 0.03128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1839999999999998e-06, "loss": 0.0, "num_tokens": 18410000.0, "reward": 0.45377030968666077, "reward_std": 0.0, "rewards/reward_fn/mean": 0.45377030968666077, "rewards/reward_fn/std": 0.9766225814819336, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.17100857943296432, "epoch": 0.03136, "grad_norm": 0.0, "learning_rate": 2.1896e-06, "loss": 0.0, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 256.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 118.203125, "completions/mean_terminated_length": 114.89600372314453, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.18057692050933838, "epoch": 0.03144, "frac_reward_zero_std": 0.75, "grad_norm": 0.6576926708221436, "learning_rate": 2.1951999999999998e-06, "loss": -0.0434, "num_tokens": 18490666.0, "reward": 0.0703125, "reward_std": 0.3499237596988678, "rewards/reward_fn/mean": 0.0703125, "rewards/reward_fn/std": 0.7954951524734497, "step": 393 }, { "clip_ratio/high_max": 0.0038411974674090743, "clip_ratio/high_mean": 0.0015583710046485066, "clip_ratio/low_mean": 0.0011236095451749861, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002681980491615832, "entropy": 0.1817432940006256, "epoch": 0.03152, "grad_norm": 1.2379415035247803, "learning_rate": 2.2008e-06, "loss": -0.0023, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 256.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 105.7265625, "completions/mean_terminated_length": 103.34127807617188, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.185156911611557, "epoch": 0.0316, "frac_reward_zero_std": 0.875, "grad_norm": 0.8625510334968567, "learning_rate": 2.2063999999999997e-06, "loss": 0.0142, "num_tokens": 18569735.0, "reward": 0.31217852234840393, "reward_std": 0.15116733312606812, "rewards/reward_fn/mean": 0.31217852234840393, "rewards/reward_fn/std": 1.1174463033676147, "step": 395 }, { "clip_ratio/high_max": 0.015673667658120394, "clip_ratio/high_mean": 0.0039184169145300984, "clip_ratio/low_mean": 0.0011346444953233004, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005053061409853399, "entropy": 0.1730366349220276, "epoch": 0.03168, "grad_norm": 0.44003888964653015, "learning_rate": 2.212e-06, "loss": -0.0136, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 81.40625, "completions/mean_terminated_length": 80.031494140625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.19638141244649887, "epoch": 0.03176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2176e-06, "loss": 0.0, "num_tokens": 18645691.0, "reward": 0.4999784827232361, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4999784827232361, "rewards/reward_fn/std": 1.0039185285568237, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.20296336710453033, "epoch": 0.03184, "grad_norm": 0.0, "learning_rate": 2.2231999999999997e-06, "loss": 0.0, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 77.90625, "completions/mean_terminated_length": 77.90625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.21066554635763168, "epoch": 0.03192, "frac_reward_zero_std": 0.875, "grad_norm": 0.4581476151943207, "learning_rate": 2.2288e-06, "loss": -0.0213, "num_tokens": 18721199.0, "reward": 0.3515625, "reward_std": 0.2151489406824112, "rewards/reward_fn/mean": 0.3515625, "rewards/reward_fn/std": 1.1677411794662476, "step": 399 }, { "clip_ratio/high_max": 0.0037280067335814238, "clip_ratio/high_mean": 0.0009320016833953559, "clip_ratio/low_mean": 0.000932557784835808, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018645594245754182, "entropy": 0.19401860237121582, "epoch": 0.032, "grad_norm": 1.5305557250976562, "learning_rate": 2.2343999999999997e-06, "loss": 0.033, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 46.359375, "completions/mean_terminated_length": 44.70866012573242, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.2561619058251381, "epoch": 0.03208, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.24e-06, "loss": 0.0, "num_tokens": 18792669.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24456633627414703, "epoch": 0.03216, "grad_norm": 0.0, "learning_rate": 2.2455999999999997e-06, "loss": 0.0, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 42.78125, "completions/mean_terminated_length": 42.78125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.25220803916454315, "epoch": 0.03224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2512e-06, "loss": 0.0, "num_tokens": 18863681.0, "reward": 0.019831063225865364, "reward_std": 0.0, "rewards/reward_fn/mean": 0.019831063225865364, "rewards/reward_fn/std": 0.052674222737550735, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24203155934810638, "epoch": 0.03232, "grad_norm": 0.0, "learning_rate": 2.2568e-06, "loss": 0.0, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 43.96875, "completions/mean_terminated_length": 43.96875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.2368392050266266, "epoch": 0.0324, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2623999999999997e-06, "loss": 0.0, "num_tokens": 18934845.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.21570950001478195, "epoch": 0.03248, "grad_norm": 0.0, "learning_rate": 2.268e-06, "loss": 0.0, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 32.65625, "completions/mean_terminated_length": 32.65625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.3047877699136734, "epoch": 0.03256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2735999999999997e-06, "loss": 0.0, "num_tokens": 19004561.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2958596646785736, "epoch": 0.03264, "grad_norm": 0.0, "learning_rate": 2.2792e-06, "loss": 0.0, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 120.0, "completions/max_terminated_length": 120.0, "completions/mean_length": 36.9375, "completions/mean_terminated_length": 36.9375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 0.28972695767879486, "epoch": 0.03272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2848e-06, "loss": 0.0, "num_tokens": 19074825.0, "reward": 0.0074910130351781845, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0074910130351781845, "rewards/reward_fn/std": 0.019897233694791794, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.28219233453273773, "epoch": 0.0328, "grad_norm": 0.0, "learning_rate": 2.2903999999999997e-06, "loss": 0.0, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.20367726683616638, "epoch": 0.03288, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.296e-06, "loss": 0.0, "num_tokens": 19144969.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.19825753569602966, "epoch": 0.03296, "grad_norm": 0.0, "learning_rate": 2.3015999999999997e-06, "loss": 0.0, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 95.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 40.328125, "completions/mean_terminated_length": 40.328125, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.25042860954999924, "epoch": 0.03304, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3072e-06, "loss": 0.0, "num_tokens": 19215667.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22648985683918, "epoch": 0.03312, "grad_norm": 0.0, "learning_rate": 2.3128e-06, "loss": 0.0, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 33.265625, "completions/mean_terminated_length": 33.265625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.19880054891109467, "epoch": 0.0332, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3184e-06, "loss": 0.0, "num_tokens": 19285461.0, "reward": 0.3874585032463074, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3874585032463074, "rewards/reward_fn/std": 0.9918686747550964, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.18751391023397446, "epoch": 0.03328, "grad_norm": 0.0, "learning_rate": 2.324e-06, "loss": 0.0, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 32.3671875, "completions/mean_terminated_length": 32.3671875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.25560078769922256, "epoch": 0.03336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3295999999999997e-06, "loss": 0.0, "num_tokens": 19355140.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2554042786359787, "epoch": 0.03344, "grad_norm": 0.0, "learning_rate": 2.3352e-06, "loss": 0.0, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 37.4140625, "completions/mean_terminated_length": 37.4140625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 0.24745354056358337, "epoch": 0.03352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3407999999999997e-06, "loss": 0.0, "num_tokens": 19425465.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.25052276253700256, "epoch": 0.0336, "grad_norm": 0.0, "learning_rate": 2.3464e-06, "loss": 0.0, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 110.0, "completions/max_terminated_length": 110.0, "completions/mean_length": 34.6484375, "completions/mean_terminated_length": 34.6484375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.2368476241827011, "epoch": 0.03368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.352e-06, "loss": 0.0, "num_tokens": 19495436.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.23861224204301834, "epoch": 0.03376, "grad_norm": 0.0, "learning_rate": 2.3576e-06, "loss": 0.0, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 37.1875, "completions/mean_terminated_length": 37.1875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.21681445091962814, "epoch": 0.03384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3632e-06, "loss": 0.0, "num_tokens": 19565732.0, "reward": 0.10771539807319641, "reward_std": 0.0, "rewards/reward_fn/mean": 0.10771539807319641, "rewards/reward_fn/std": 0.28610795736312866, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.21971473842859268, "epoch": 0.03392, "grad_norm": 0.0, "learning_rate": 2.3687999999999997e-06, "loss": 0.0, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 32.8125, "completions/mean_terminated_length": 32.8125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.22385847568511963, "epoch": 0.034, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3744e-06, "loss": 0.0, "num_tokens": 19635468.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.23408270627260208, "epoch": 0.03408, "grad_norm": 0.0, "learning_rate": 2.38e-06, "loss": 0.0, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 32.59375, "completions/mean_terminated_length": 32.59375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.24008037149906158, "epoch": 0.03416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3856e-06, "loss": 0.0, "num_tokens": 19705176.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.23081659525632858, "epoch": 0.03424, "grad_norm": 0.0, "learning_rate": 2.3912e-06, "loss": 0.0, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 43.453125, "completions/mean_terminated_length": 43.453125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 0.2153729721903801, "epoch": 0.03432, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3967999999999997e-06, "loss": 0.0, "num_tokens": 19776274.0, "reward": 0.5739399194717407, "reward_std": 0.0, "rewards/reward_fn/mean": 0.5739399194717407, "rewards/reward_fn/std": 0.985861599445343, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22236867994070053, "epoch": 0.0344, "grad_norm": 0.0, "learning_rate": 2.4024e-06, "loss": 0.0, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 36.265625, "completions/mean_terminated_length": 36.265625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.2392250895500183, "epoch": 0.03448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4079999999999996e-06, "loss": 0.0, "num_tokens": 19846452.0, "reward": 0.7817869186401367, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7817869186401367, "rewards/reward_fn/std": 1.2883555889129639, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.25604553520679474, "epoch": 0.03456, "grad_norm": 0.0, "learning_rate": 2.4136e-06, "loss": 0.0, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 38.328125, "completions/mean_terminated_length": 38.328125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.15897995233535767, "epoch": 0.03464, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4192e-06, "loss": 0.0, "num_tokens": 19916894.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.16450094431638718, "epoch": 0.03472, "grad_norm": 0.0, "learning_rate": 2.4248e-06, "loss": 0.0, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 41.0390625, "completions/mean_terminated_length": 41.0390625, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.19463489204645157, "epoch": 0.0348, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4304e-06, "loss": 0.0, "num_tokens": 19987683.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.20826875418424606, "epoch": 0.03488, "grad_norm": 0.0, "learning_rate": 2.4359999999999996e-06, "loss": 0.0, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 33.71875, "completions/mean_terminated_length": 33.71875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.20781591534614563, "epoch": 0.03496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4416e-06, "loss": 0.0, "num_tokens": 20057535.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2088342308998108, "epoch": 0.03504, "grad_norm": 0.0, "learning_rate": 2.4472e-06, "loss": 0.0, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 35.6640625, "completions/mean_terminated_length": 35.6640625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.2069905400276184, "epoch": 0.03512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4528e-06, "loss": 0.0, "num_tokens": 20127636.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.210297591984272, "epoch": 0.0352, "grad_norm": 0.0, "learning_rate": 2.4584e-06, "loss": 0.0, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 31.96875, "completions/mean_terminated_length": 31.96875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.2546781003475189, "epoch": 0.03528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4639999999999996e-06, "loss": 0.0, "num_tokens": 20197264.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.25752606987953186, "epoch": 0.03536, "grad_norm": 0.0, "learning_rate": 2.4696e-06, "loss": 0.0, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 95.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 36.375, "completions/mean_terminated_length": 36.375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 0.25916359573602676, "epoch": 0.03544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4752e-06, "loss": 0.0, "num_tokens": 20267456.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.26926393806934357, "epoch": 0.03552, "grad_norm": 0.0, "learning_rate": 2.4808e-06, "loss": 0.0, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 32.390625, "completions/mean_terminated_length": 32.390625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.2613266259431839, "epoch": 0.0356, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4864e-06, "loss": 0.0, "num_tokens": 20337138.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.25910837948322296, "epoch": 0.03568, "grad_norm": 0.0, "learning_rate": 2.492e-06, "loss": 0.0, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 39.5390625, "completions/mean_terminated_length": 39.5390625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.18105162680149078, "epoch": 0.03576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4976e-06, "loss": 0.0, "num_tokens": 20407735.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.17712099105119705, "epoch": 0.03584, "grad_norm": 0.0, "learning_rate": 2.5031999999999996e-06, "loss": 0.0, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 33.8515625, "completions/mean_terminated_length": 33.8515625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.24982718378305435, "epoch": 0.03592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5088e-06, "loss": 0.0, "num_tokens": 20477604.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24602626264095306, "epoch": 0.036, "grad_norm": 0.0, "learning_rate": 2.5144e-06, "loss": 0.0, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 101.0, "completions/max_terminated_length": 101.0, "completions/mean_length": 36.796875, "completions/mean_terminated_length": 36.796875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.24324693530797958, "epoch": 0.03608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.52e-06, "loss": 0.0, "num_tokens": 20547850.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2489098682999611, "epoch": 0.03616, "grad_norm": 0.0, "learning_rate": 2.5256e-06, "loss": 0.0, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 33.1953125, "completions/mean_terminated_length": 33.1953125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.2733784168958664, "epoch": 0.03624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5312e-06, "loss": 0.0, "num_tokens": 20617635.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2620960772037506, "epoch": 0.03632, "grad_norm": 0.0, "learning_rate": 2.5368e-06, "loss": 0.0, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 34.5546875, "completions/mean_terminated_length": 34.5546875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.22276544570922852, "epoch": 0.0364, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5424e-06, "loss": 0.0, "num_tokens": 20687594.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2155904695391655, "epoch": 0.03648, "grad_norm": 0.0, "learning_rate": 2.548e-06, "loss": 0.0, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 36.6171875, "completions/mean_terminated_length": 36.6171875, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.21754838526248932, "epoch": 0.03656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5536e-06, "loss": 0.0, "num_tokens": 20757817.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.21110960096120834, "epoch": 0.03664, "grad_norm": 0.0, "learning_rate": 2.5592e-06, "loss": 0.0, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 32.6640625, "completions/mean_terminated_length": 32.6640625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.24139059334993362, "epoch": 0.03672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5648e-06, "loss": 0.0, "num_tokens": 20827534.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.23926536738872528, "epoch": 0.0368, "grad_norm": 0.0, "learning_rate": 2.5704e-06, "loss": 0.0, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 108.0, "completions/max_terminated_length": 108.0, "completions/mean_length": 38.453125, "completions/mean_terminated_length": 38.453125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 0.2310691848397255, "epoch": 0.03688, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.576e-06, "loss": 0.0, "num_tokens": 20897992.0, "reward": 0.004997334908694029, "reward_std": 0.0, "rewards/reward_fn/mean": 0.004997334908694029, "rewards/reward_fn/std": 0.013273656368255615, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22855322808027267, "epoch": 0.03696, "grad_norm": 0.0, "learning_rate": 2.5816e-06, "loss": 0.0, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 91.0, "completions/max_terminated_length": 91.0, "completions/mean_length": 38.484375, "completions/mean_terminated_length": 38.484375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.23250539600849152, "epoch": 0.03704, "frac_reward_zero_std": 0.875, "grad_norm": 1.6323295831680298, "learning_rate": 2.5872e-06, "loss": 0.0533, "num_tokens": 20968454.0, "reward": 0.3515625, "reward_std": 0.09375, "rewards/reward_fn/mean": 0.3515625, "rewards/reward_fn/std": 0.9687222242355347, "step": 463 }, { "clip_ratio/high_max": 0.0024096386041492224, "clip_ratio/high_mean": 0.0006024096510373056, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006024096510373056, "entropy": 0.23732595890760422, "epoch": 0.03712, "grad_norm": 0.23431801795959473, "learning_rate": 2.5928e-06, "loss": -0.0277, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 31.40625, "completions/mean_terminated_length": 31.40625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.2525569796562195, "epoch": 0.0372, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5984e-06, "loss": 0.0, "num_tokens": 21038010.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2441243827342987, "epoch": 0.03728, "grad_norm": 0.0, "learning_rate": 2.604e-06, "loss": 0.0, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 33.125, "completions/mean_terminated_length": 33.125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 0.28054283559322357, "epoch": 0.03736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6096e-06, "loss": 0.0, "num_tokens": 21107786.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2820121645927429, "epoch": 0.03744, "grad_norm": 0.0, "learning_rate": 2.6152e-06, "loss": 0.0, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 34.609375, "completions/mean_terminated_length": 34.609375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.26786385476589203, "epoch": 0.03752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6208e-06, "loss": 0.0, "num_tokens": 21177752.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24899546056985855, "epoch": 0.0376, "grad_norm": 0.0, "learning_rate": 2.6264e-06, "loss": 0.0, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 34.7109375, "completions/mean_terminated_length": 34.7109375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.22337084263563156, "epoch": 0.03768, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.632e-06, "loss": 0.0, "num_tokens": 21247731.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2152806669473648, "epoch": 0.03776, "grad_norm": 0.0, "learning_rate": 2.6376e-06, "loss": 0.0, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 110.0, "completions/max_terminated_length": 110.0, "completions/mean_length": 40.875, "completions/mean_terminated_length": 40.875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 0.20601772516965866, "epoch": 0.03784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6432e-06, "loss": 0.0, "num_tokens": 21318499.0, "reward": 0.4091131389141083, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4091131389141083, "rewards/reward_fn/std": 0.9871928691864014, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.19965504854917526, "epoch": 0.03792, "grad_norm": 0.0, "learning_rate": 2.6488e-06, "loss": 0.0, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 36.3828125, "completions/mean_terminated_length": 36.3828125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.26158344745635986, "epoch": 0.038, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6544e-06, "loss": 0.0, "num_tokens": 21388692.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.26463960111141205, "epoch": 0.03808, "grad_norm": 0.0, "learning_rate": 2.66e-06, "loss": 0.0, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 99.0, "completions/max_terminated_length": 99.0, "completions/mean_length": 40.703125, "completions/mean_terminated_length": 40.703125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 0.20898433029651642, "epoch": 0.03816, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6656e-06, "loss": 0.0, "num_tokens": 21459438.0, "reward": 0.384978711605072, "reward_std": 0.0, "rewards/reward_fn/mean": 0.384978711605072, "rewards/reward_fn/std": 0.9926154613494873, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.20072608441114426, "epoch": 0.03824, "grad_norm": 0.0, "learning_rate": 2.6712e-06, "loss": 0.0, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 97.0, "completions/max_terminated_length": 97.0, "completions/mean_length": 39.546875, "completions/mean_terminated_length": 39.546875, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.20889824628829956, "epoch": 0.03832, "frac_reward_zero_std": 0.875, "grad_norm": 2.1122493743896484, "learning_rate": 2.6768e-06, "loss": 0.0036, "num_tokens": 21530036.0, "reward": 0.47613078355789185, "reward_std": 0.03947741165757179, "rewards/reward_fn/mean": 0.47613078355789185, "rewards/reward_fn/std": 0.9998084902763367, "step": 479 }, { "clip_ratio/high_max": 0.015283661894500256, "clip_ratio/high_mean": 0.003820915473625064, "clip_ratio/low_mean": 0.0008976660901680589, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004718581680208445, "entropy": 0.20591527223587036, "epoch": 0.0384, "grad_norm": 0.7259573936462402, "learning_rate": 2.6824e-06, "loss": -0.0053, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 101.0, "completions/max_terminated_length": 101.0, "completions/mean_length": 39.6875, "completions/mean_terminated_length": 39.6875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.21302638947963715, "epoch": 0.03848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.688e-06, "loss": 0.0, "num_tokens": 21600652.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.21653297543525696, "epoch": 0.03856, "grad_norm": 0.0, "learning_rate": 2.6936e-06, "loss": 0.0, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 33.6640625, "completions/mean_terminated_length": 33.6640625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.2644335627555847, "epoch": 0.03864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6992e-06, "loss": 0.0, "num_tokens": 21670497.0, "reward": 0.09049560874700546, "reward_std": 0.0, "rewards/reward_fn/mean": 0.09049560874700546, "rewards/reward_fn/std": 0.2185191661119461, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2708878368139267, "epoch": 0.03872, "grad_norm": 0.0, "learning_rate": 2.7048e-06, "loss": 0.0, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 41.125, "completions/mean_terminated_length": 41.125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.20904885232448578, "epoch": 0.0388, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7104e-06, "loss": 0.0, "num_tokens": 21741297.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2187659740447998, "epoch": 0.03888, "grad_norm": 0.0, "learning_rate": 2.716e-06, "loss": 0.0, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 108.0, "completions/max_terminated_length": 108.0, "completions/mean_length": 75.0, "completions/mean_terminated_length": 75.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.14385749399662018, "epoch": 0.03896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7216e-06, "loss": 0.0, "num_tokens": 21816433.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.13944195955991745, "epoch": 0.03904, "grad_norm": 0.0, "learning_rate": 2.7272e-06, "loss": 0.0, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 99.4296875, "completions/mean_terminated_length": 99.4296875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.10257593914866447, "epoch": 0.03912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7328000000000003e-06, "loss": 0.0, "num_tokens": 21894696.0, "reward": 0.04533843323588371, "reward_std": 0.0, "rewards/reward_fn/mean": 0.04533843323588371, "rewards/reward_fn/std": 0.12042555958032608, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10321344807744026, "epoch": 0.0392, "grad_norm": 0.0, "learning_rate": 2.7384e-06, "loss": 0.0, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 90.6484375, "completions/mean_terminated_length": 90.6484375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.10771527513861656, "epoch": 0.03928, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.744e-06, "loss": 0.0, "num_tokens": 21971835.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10656651854515076, "epoch": 0.03936, "grad_norm": 0.0, "learning_rate": 2.7496e-06, "loss": 0.0, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 111.0, "completions/max_terminated_length": 111.0, "completions/mean_length": 89.234375, "completions/mean_terminated_length": 89.234375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.09489329159259796, "epoch": 0.03944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7552e-06, "loss": 0.0, "num_tokens": 22048793.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09601403400301933, "epoch": 0.03952, "grad_norm": 0.0, "learning_rate": 2.7608e-06, "loss": 0.0, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 104.03125, "completions/mean_terminated_length": 104.03125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.10365442931652069, "epoch": 0.0396, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7664e-06, "loss": 0.0, "num_tokens": 22127645.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10087046399712563, "epoch": 0.03968, "grad_norm": 0.0, "learning_rate": 2.772e-06, "loss": 0.0, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 101.640625, "completions/mean_terminated_length": 101.640625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.10023761540651321, "epoch": 0.03976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7776e-06, "loss": 0.0, "num_tokens": 22206191.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10102415084838867, "epoch": 0.03984, "grad_norm": 0.0, "learning_rate": 2.7832e-06, "loss": 0.0, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 103.1171875, "completions/mean_terminated_length": 103.1171875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.08557040989398956, "epoch": 0.03992, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7888e-06, "loss": 0.0, "num_tokens": 22284926.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08953556418418884, "epoch": 0.04, "grad_norm": 0.0, "learning_rate": 2.7944e-06, "loss": 0.0, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 96.6953125, "completions/mean_terminated_length": 96.6953125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.10519624501466751, "epoch": 0.04008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8000000000000003e-06, "loss": 0.0, "num_tokens": 22362839.0, "reward": 0.09723600745201111, "reward_std": 0.0, "rewards/reward_fn/mean": 0.09723600745201111, "rewards/reward_fn/std": 0.2582731544971466, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0971563272178173, "epoch": 0.04016, "grad_norm": 0.0, "learning_rate": 2.8056e-06, "loss": 0.0, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 94.7734375, "completions/mean_terminated_length": 94.7734375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.11430378630757332, "epoch": 0.04024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8112e-06, "loss": 0.0, "num_tokens": 22440506.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11006181314587593, "epoch": 0.04032, "grad_norm": 0.0, "learning_rate": 2.8168e-06, "loss": 0.0, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 110.5859375, "completions/mean_terminated_length": 110.5859375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.08717502653598785, "epoch": 0.0404, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8224e-06, "loss": 0.0, "num_tokens": 22520197.0, "reward": 0.48395901918411255, "reward_std": 0.0, "rewards/reward_fn/mean": 0.48395901918411255, "rewards/reward_fn/std": 0.996755838394165, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08991051465272903, "epoch": 0.04048, "grad_norm": 0.0, "learning_rate": 2.8280000000000003e-06, "loss": 0.0, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 103.0078125, "completions/mean_terminated_length": 103.0078125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.10764952003955841, "epoch": 0.04056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8336e-06, "loss": 0.0, "num_tokens": 22598918.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11288165301084518, "epoch": 0.04064, "grad_norm": 0.0, "learning_rate": 2.8392000000000003e-06, "loss": 0.0, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 102.921875, "completions/mean_terminated_length": 102.921875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.10737181082367897, "epoch": 0.04072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8448e-06, "loss": 0.0, "num_tokens": 22677628.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10458310320973396, "epoch": 0.0408, "grad_norm": 0.0, "learning_rate": 2.8504e-06, "loss": 0.0, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 117.109375, "completions/mean_terminated_length": 117.109375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.08996330946683884, "epoch": 0.04088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.856e-06, "loss": 0.0, "num_tokens": 22758154.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08821629360318184, "epoch": 0.04096, "grad_norm": 0.0, "learning_rate": 2.8616e-06, "loss": 0.0, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 93.671875, "completions/mean_terminated_length": 93.671875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.09587747603654861, "epoch": 0.04104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8672000000000003e-06, "loss": 0.0, "num_tokens": 22835680.0, "reward": 0.4768017530441284, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4768017530441284, "rewards/reward_fn/std": 0.9941276907920837, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09478938952088356, "epoch": 0.04112, "grad_norm": 0.0, "learning_rate": 2.8728e-06, "loss": 0.0, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/max_terminated_length": 121.0, "completions/mean_length": 102.2421875, "completions/mean_terminated_length": 102.2421875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.11604179441928864, "epoch": 0.0412, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8784e-06, "loss": 0.0, "num_tokens": 22914303.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11469186469912529, "epoch": 0.04128, "grad_norm": 0.0, "learning_rate": 2.884e-06, "loss": 0.0, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 100.8828125, "completions/mean_terminated_length": 100.8828125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.10793548822402954, "epoch": 0.04136, "frac_reward_zero_std": 0.875, "grad_norm": 0.9031439423561096, "learning_rate": 2.8896e-06, "loss": -0.0017, "num_tokens": 22992752.0, "reward": 0.64774090051651, "reward_std": 0.17951758205890656, "rewards/reward_fn/mean": 0.64774090051651, "rewards/reward_fn/std": 1.2216017246246338, "step": 517 }, { "clip_ratio/high_max": 0.003610862302593887, "clip_ratio/high_mean": 0.0009027155756484717, "clip_ratio/low_mean": 0.003005672828294337, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003908388200215995, "entropy": 0.09997102990746498, "epoch": 0.04144, "grad_norm": 0.25070899724960327, "learning_rate": 2.8952000000000002e-06, "loss": 0.0056, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 100.9765625, "completions/mean_terminated_length": 100.9765625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.11052704229950905, "epoch": 0.04152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9008e-06, "loss": 0.0, "num_tokens": 23071213.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11459873989224434, "epoch": 0.0416, "grad_norm": 0.0, "learning_rate": 2.9064000000000002e-06, "loss": 0.0, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 102.0, "completions/mean_terminated_length": 102.0, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.11590122431516647, "epoch": 0.04168, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.912e-06, "loss": 0.0, "num_tokens": 23149805.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10989456623792648, "epoch": 0.04176, "grad_norm": 0.0, "learning_rate": 2.9176e-06, "loss": 0.0, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 105.8359375, "completions/mean_terminated_length": 105.8359375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.125211663544178, "epoch": 0.04184, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9232000000000002e-06, "loss": 0.0, "num_tokens": 23228888.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.112376868724823, "epoch": 0.04192, "grad_norm": 0.0, "learning_rate": 2.9288e-06, "loss": 0.0, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 111.7578125, "completions/mean_terminated_length": 111.7578125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.08393756672739983, "epoch": 0.042, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9344000000000002e-06, "loss": 0.0, "num_tokens": 23308729.0, "reward": 0.625, "reward_std": 0.0, "rewards/reward_fn/mean": 0.625, "rewards/reward_fn/std": 1.1153898239135742, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0955238789319992, "epoch": 0.04208, "grad_norm": 0.0, "learning_rate": 2.94e-06, "loss": 0.0, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 105.28125, "completions/mean_terminated_length": 105.28125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.09766561165452003, "epoch": 0.04216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9456e-06, "loss": 0.0, "num_tokens": 23387741.0, "reward": 0.02943696826696396, "reward_std": 0.0, "rewards/reward_fn/mean": 0.02943696826696396, "rewards/reward_fn/std": 0.07818891853094101, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09913990274071693, "epoch": 0.04224, "grad_norm": 0.0, "learning_rate": 2.9512e-06, "loss": 0.0, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 104.8671875, "completions/mean_terminated_length": 104.8671875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.11352923512458801, "epoch": 0.04232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9568e-06, "loss": 0.0, "num_tokens": 23466700.0, "reward": 0.38992840051651, "reward_std": 0.0, "rewards/reward_fn/mean": 0.38992840051651, "rewards/reward_fn/std": 0.9911679029464722, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09893468022346497, "epoch": 0.0424, "grad_norm": 0.0, "learning_rate": 2.9624000000000002e-06, "loss": 0.0, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 89.4921875, "completions/mean_terminated_length": 89.4921875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.10253092274069786, "epoch": 0.04248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.968e-06, "loss": 0.0, "num_tokens": 23543691.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10501282662153244, "epoch": 0.04256, "grad_norm": 0.0, "learning_rate": 2.9736e-06, "loss": 0.0, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 102.5703125, "completions/mean_terminated_length": 102.5703125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.09590747207403183, "epoch": 0.04264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9791999999999998e-06, "loss": 0.0, "num_tokens": 23622356.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09883136674761772, "epoch": 0.04272, "grad_norm": 0.0, "learning_rate": 2.9847999999999998e-06, "loss": 0.0, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 108.125, "completions/mean_terminated_length": 108.125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.1041860356926918, "epoch": 0.0428, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9904e-06, "loss": 0.0, "num_tokens": 23701732.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09968229755759239, "epoch": 0.04288, "grad_norm": 0.0, "learning_rate": 2.9959999999999998e-06, "loss": 0.0, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 119.3359375, "completions/mean_terminated_length": 119.3359375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.09312013909220695, "epoch": 0.04296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0016e-06, "loss": 0.0, "num_tokens": 23782543.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08949526399374008, "epoch": 0.04304, "grad_norm": 0.0, "learning_rate": 3.0071999999999998e-06, "loss": 0.0, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 103.1796875, "completions/mean_terminated_length": 103.1796875, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.11381325125694275, "epoch": 0.04312, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0128e-06, "loss": 0.0, "num_tokens": 23861286.0, "reward": 0.384978711605072, "reward_std": 0.0, "rewards/reward_fn/mean": 0.384978711605072, "rewards/reward_fn/std": 0.9926154613494873, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10425405949354172, "epoch": 0.0432, "grad_norm": 0.0, "learning_rate": 3.0184e-06, "loss": 0.0, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 100.0390625, "completions/mean_terminated_length": 100.0390625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.10640548542141914, "epoch": 0.04328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0239999999999998e-06, "loss": 0.0, "num_tokens": 23939627.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11250539496541023, "epoch": 0.04336, "grad_norm": 0.0, "learning_rate": 3.0296e-06, "loss": 0.0, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 101.1015625, "completions/mean_terminated_length": 101.1015625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.10538192465901375, "epoch": 0.04344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0351999999999998e-06, "loss": 0.0, "num_tokens": 24018104.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10470309853553772, "epoch": 0.04352, "grad_norm": 0.0, "learning_rate": 3.0408e-06, "loss": 0.0, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 256.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 112.296875, "completions/mean_terminated_length": 110.01587677001953, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.09644613042473793, "epoch": 0.0436, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0463999999999998e-06, "loss": 0.0, "num_tokens": 24098014.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09140098467469215, "epoch": 0.04368, "grad_norm": 0.0, "learning_rate": 3.0519999999999997e-06, "loss": 0.0, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 118.796875, "completions/mean_terminated_length": 117.71653747558594, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 0.08893723040819168, "epoch": 0.04376, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0576e-06, "loss": 0.0, "num_tokens": 24178756.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09115546941757202, "epoch": 0.04384, "grad_norm": 0.0, "learning_rate": 3.0631999999999997e-06, "loss": 0.0, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 114.2421875, "completions/mean_terminated_length": 114.2421875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.09072719141840935, "epoch": 0.04392, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0688e-06, "loss": 0.0, "num_tokens": 24258915.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0889349915087223, "epoch": 0.044, "grad_norm": 0.0, "learning_rate": 3.0743999999999997e-06, "loss": 0.0, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 256.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 131.4296875, "completions/mean_terminated_length": 129.452392578125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.10806973278522491, "epoch": 0.04408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.08e-06, "loss": 0.0, "num_tokens": 24341274.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10341455042362213, "epoch": 0.04416, "grad_norm": 0.0, "learning_rate": 3.0856e-06, "loss": 0.0, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 108.609375, "completions/mean_terminated_length": 108.609375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.09930199384689331, "epoch": 0.04424, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0911999999999997e-06, "loss": 0.0, "num_tokens": 24420712.0, "reward": 0.11482523381710052, "reward_std": 0.0, "rewards/reward_fn/mean": 0.11482523381710052, "rewards/reward_fn/std": 0.3049927055835724, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1031503826379776, "epoch": 0.04432, "grad_norm": 0.0, "learning_rate": 3.0968e-06, "loss": 0.0, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 125.7421875, "completions/mean_terminated_length": 125.7421875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.09317844361066818, "epoch": 0.0444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1023999999999997e-06, "loss": 0.0, "num_tokens": 24502343.0, "reward": 0.11482523381710052, "reward_std": 0.0, "rewards/reward_fn/mean": 0.11482523381710052, "rewards/reward_fn/std": 0.3049927055835724, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09576775133609772, "epoch": 0.04448, "grad_norm": 0.0, "learning_rate": 3.108e-06, "loss": 0.0, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 119.3359375, "completions/mean_terminated_length": 118.25984191894531, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.08646593242883682, "epoch": 0.04456, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1135999999999997e-06, "loss": 0.0, "num_tokens": 24583154.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09078627824783325, "epoch": 0.04464, "grad_norm": 0.0, "learning_rate": 3.1192e-06, "loss": 0.0, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 120.46875, "completions/mean_terminated_length": 120.46875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.09674259275197983, "epoch": 0.04472, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1248e-06, "loss": 0.0, "num_tokens": 24664110.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09560255706310272, "epoch": 0.0448, "grad_norm": 0.0, "learning_rate": 3.1303999999999997e-06, "loss": 0.0, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 116.8515625, "completions/mean_terminated_length": 116.8515625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.08457322418689728, "epoch": 0.04488, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.136e-06, "loss": 0.0, "num_tokens": 24744603.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08180662989616394, "epoch": 0.04496, "grad_norm": 0.0, "learning_rate": 3.1415999999999997e-06, "loss": 0.0, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 127.7421875, "completions/mean_terminated_length": 118.04202270507812, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.10254533588886261, "epoch": 0.04504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1472e-06, "loss": 0.0, "num_tokens": 24826490.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11061909049749374, "epoch": 0.04512, "grad_norm": 0.0, "learning_rate": 3.1528e-06, "loss": 0.0, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 256.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 127.8984375, "completions/mean_terminated_length": 125.8650894165039, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.09707377105951309, "epoch": 0.0452, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1583999999999997e-06, "loss": 0.0, "num_tokens": 24908397.0, "reward": 0.4998645484447479, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4998645484447479, "rewards/reward_fn/std": 1.0038613080978394, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09350791200995445, "epoch": 0.04528, "grad_norm": 0.0, "learning_rate": 3.164e-06, "loss": 0.0, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 103.2265625, "completions/mean_terminated_length": 103.2265625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.10792716592550278, "epoch": 0.04536, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1695999999999997e-06, "loss": 0.0, "num_tokens": 24987146.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10400049760937691, "epoch": 0.04544, "grad_norm": 0.0, "learning_rate": 3.1752e-06, "loss": 0.0, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 118.3984375, "completions/mean_terminated_length": 118.3984375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.09677662700414658, "epoch": 0.04552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1808e-06, "loss": 0.0, "num_tokens": 25067837.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09737365692853928, "epoch": 0.0456, "grad_norm": 0.0, "learning_rate": 3.1864e-06, "loss": 0.0, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 256.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 122.390625, "completions/mean_terminated_length": 120.26985168457031, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.09899494051933289, "epoch": 0.04568, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.192e-06, "loss": 0.0, "num_tokens": 25149039.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09659038484096527, "epoch": 0.04576, "grad_norm": 0.0, "learning_rate": 3.1975999999999997e-06, "loss": 0.0, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 112.7265625, "completions/mean_terminated_length": 112.7265625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.10070900246500969, "epoch": 0.04584, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2032e-06, "loss": 0.0, "num_tokens": 25229004.0, "reward": 0.10180176049470901, "reward_std": 0.0, "rewards/reward_fn/mean": 0.10180176049470901, "rewards/reward_fn/std": 0.2704004645347595, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09966763481497765, "epoch": 0.04592, "grad_norm": 0.0, "learning_rate": 3.2087999999999997e-06, "loss": 0.0, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 256.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 119.34375, "completions/mean_terminated_length": 111.4380111694336, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.09006837010383606, "epoch": 0.046, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2144e-06, "loss": 0.0, "num_tokens": 25309816.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09142518043518066, "epoch": 0.04608, "grad_norm": 0.0, "learning_rate": 3.22e-06, "loss": 0.0, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 256.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 118.4765625, "completions/mean_terminated_length": 111.7131118774414, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.10490916296839714, "epoch": 0.04616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2255999999999997e-06, "loss": 0.0, "num_tokens": 25390517.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10142472386360168, "epoch": 0.04624, "grad_norm": 0.0, "learning_rate": 3.2312e-06, "loss": 0.0, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 117.1171875, "completions/mean_terminated_length": 117.1171875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.08435190841555595, "epoch": 0.04632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2367999999999997e-06, "loss": 0.0, "num_tokens": 25471044.0, "reward": 0.07554597407579422, "reward_std": 0.0, "rewards/reward_fn/mean": 0.07554597407579422, "rewards/reward_fn/std": 0.20066124200820923, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08378316462039948, "epoch": 0.0464, "grad_norm": 0.0, "learning_rate": 3.2424e-06, "loss": 0.0, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 112.8515625, "completions/mean_terminated_length": 112.8515625, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.08868909627199173, "epoch": 0.04648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.248e-06, "loss": 0.0, "num_tokens": 25551025.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08953263610601425, "epoch": 0.04656, "grad_norm": 0.0, "learning_rate": 3.2536e-06, "loss": 0.0, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 106.171875, "completions/mean_terminated_length": 106.171875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.1067977249622345, "epoch": 0.04664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2592e-06, "loss": 0.0, "num_tokens": 25630151.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10521748289465904, "epoch": 0.04672, "grad_norm": 0.0, "learning_rate": 3.2647999999999996e-06, "loss": 0.0, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 117.8671875, "completions/mean_terminated_length": 117.8671875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.08876101672649384, "epoch": 0.0468, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2704e-06, "loss": 0.0, "num_tokens": 25710774.0, "reward": 0.5050700902938843, "reward_std": 0.0, "rewards/reward_fn/mean": 0.5050700902938843, "rewards/reward_fn/std": 0.9907429814338684, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08162789046764374, "epoch": 0.04688, "grad_norm": 0.0, "learning_rate": 3.276e-06, "loss": 0.0, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 115.6875, "completions/mean_terminated_length": 115.6875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.10192861407995224, "epoch": 0.04696, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2816e-06, "loss": 0.0, "num_tokens": 25791118.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09462397918105125, "epoch": 0.04704, "grad_norm": 0.0, "learning_rate": 3.2872e-06, "loss": 0.0, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 109.7421875, "completions/mean_terminated_length": 109.7421875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.10100723803043365, "epoch": 0.04712, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2928e-06, "loss": 0.0, "num_tokens": 25870701.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0983591228723526, "epoch": 0.0472, "grad_norm": 0.0, "learning_rate": 3.2984e-06, "loss": 0.0, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 116.1171875, "completions/mean_terminated_length": 115.0157470703125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.10593640431761742, "epoch": 0.04728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3039999999999996e-06, "loss": 0.0, "num_tokens": 25951100.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10363966226577759, "epoch": 0.04736, "grad_norm": 0.0, "learning_rate": 3.3096e-06, "loss": 0.0, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 118.7734375, "completions/mean_terminated_length": 117.69290924072266, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.10684658586978912, "epoch": 0.04744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3152e-06, "loss": 0.0, "num_tokens": 26031839.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10523829981684685, "epoch": 0.04752, "grad_norm": 0.0, "learning_rate": 3.3208e-06, "loss": 0.0, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 123.9375, "completions/mean_terminated_length": 123.9375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.09462854266166687, "epoch": 0.0476, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3264e-06, "loss": 0.0, "num_tokens": 26113239.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09821495786309242, "epoch": 0.04768, "grad_norm": 0.0, "learning_rate": 3.3319999999999996e-06, "loss": 0.0, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 118.75, "completions/mean_terminated_length": 114.32257843017578, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.09841631352901459, "epoch": 0.04776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3376e-06, "loss": 0.0, "num_tokens": 26193975.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09719715267419815, "epoch": 0.04784, "grad_norm": 0.0, "learning_rate": 3.3432e-06, "loss": 0.0, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 117.875, "completions/mean_terminated_length": 117.875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.10090677067637444, "epoch": 0.04792, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3488e-06, "loss": 0.0, "num_tokens": 26274599.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10234848782420158, "epoch": 0.048, "grad_norm": 0.0, "learning_rate": 3.3544e-06, "loss": 0.0, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 110.625, "completions/mean_terminated_length": 110.625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.10128725692629814, "epoch": 0.04808, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.36e-06, "loss": 0.0, "num_tokens": 26354295.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09961574897170067, "epoch": 0.04816, "grad_norm": 0.0, "learning_rate": 3.3656e-06, "loss": 0.0, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 108.796875, "completions/mean_terminated_length": 108.796875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.10779217258095741, "epoch": 0.04824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3711999999999996e-06, "loss": 0.0, "num_tokens": 26433757.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09762543439865112, "epoch": 0.04832, "grad_norm": 0.0, "learning_rate": 3.3768e-06, "loss": 0.0, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 102.9375, "completions/mean_terminated_length": 102.9375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.10663582012057304, "epoch": 0.0484, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3824e-06, "loss": 0.0, "num_tokens": 26512469.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10544880852103233, "epoch": 0.04848, "grad_norm": 0.0, "learning_rate": 3.388e-06, "loss": 0.0, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 111.8984375, "completions/mean_terminated_length": 110.76377868652344, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.09567398577928543, "epoch": 0.04856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3936e-06, "loss": 0.0, "num_tokens": 26592328.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09285252541303635, "epoch": 0.04864, "grad_norm": 0.0, "learning_rate": 3.3991999999999996e-06, "loss": 0.0, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 110.5390625, "completions/mean_terminated_length": 110.5390625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.10228767618536949, "epoch": 0.04872, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4048e-06, "loss": 0.0, "num_tokens": 26672013.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10701350122690201, "epoch": 0.0488, "grad_norm": 0.0, "learning_rate": 3.4104e-06, "loss": 0.0, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 106.8671875, "completions/mean_terminated_length": 106.8671875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.08858393877744675, "epoch": 0.04888, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.416e-06, "loss": 0.0, "num_tokens": 26751228.0, "reward": 0.009978720918297768, "reward_std": 0.0, "rewards/reward_fn/mean": 0.009978720918297768, "rewards/reward_fn/std": 0.02650495432317257, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08848981559276581, "epoch": 0.04896, "grad_norm": 0.0, "learning_rate": 3.4216e-06, "loss": 0.0, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 104.9921875, "completions/mean_terminated_length": 104.9921875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.10054391250014305, "epoch": 0.04904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4272e-06, "loss": 0.0, "num_tokens": 26830203.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10278047621250153, "epoch": 0.04912, "grad_norm": 0.0, "learning_rate": 3.4328e-06, "loss": 0.0, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 113.0859375, "completions/mean_terminated_length": 111.96063232421875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.1050216406583786, "epoch": 0.0492, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4384e-06, "loss": 0.0, "num_tokens": 26910214.0, "reward": 0.7817869186401367, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7817869186401367, "rewards/reward_fn/std": 1.2883555889129639, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10192431509494781, "epoch": 0.04928, "grad_norm": 0.0, "learning_rate": 3.444e-06, "loss": 0.0, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 122.3515625, "completions/mean_terminated_length": 122.3515625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.09873774275183678, "epoch": 0.04936, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4496e-06, "loss": 0.0, "num_tokens": 26991411.0, "reward": 0.08572613447904587, "reward_std": 0.0, "rewards/reward_fn/mean": 0.08572613447904587, "rewards/reward_fn/std": 0.22770123183727264, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10901334509253502, "epoch": 0.04944, "grad_norm": 0.0, "learning_rate": 3.4552e-06, "loss": 0.0, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 108.1875, "completions/mean_terminated_length": 107.02362060546875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.10695511847734451, "epoch": 0.04952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4608e-06, "loss": 0.0, "num_tokens": 27070795.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09550732001662254, "epoch": 0.0496, "grad_norm": 0.0, "learning_rate": 3.4664e-06, "loss": 0.0, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 103.90625, "completions/mean_terminated_length": 103.90625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.09738783165812492, "epoch": 0.04968, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.472e-06, "loss": 0.0, "num_tokens": 27149631.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10152488574385643, "epoch": 0.04976, "grad_norm": 0.0, "learning_rate": 3.4776e-06, "loss": 0.0, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 99.265625, "completions/mean_terminated_length": 99.265625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.09529266506433487, "epoch": 0.04984, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4832e-06, "loss": 0.0, "num_tokens": 27227873.0, "reward": 0.44060659408569336, "reward_std": 0.0, "rewards/reward_fn/mean": 0.44060659408569336, "rewards/reward_fn/std": 0.9780665636062622, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09267657622694969, "epoch": 0.04992, "grad_norm": 0.0, "learning_rate": 3.4888e-06, "loss": 0.0, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 107.0, "completions/mean_terminated_length": 107.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.10003218427300453, "epoch": 0.05, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4944e-06, "loss": 0.0, "num_tokens": 27307105.0, "reward": 0.01492841262370348, "reward_std": 0.0, "rewards/reward_fn/mean": 0.01492841262370348, "rewards/reward_fn/std": 0.039652060717344284, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10089706629514694, "epoch": 0.05008, "grad_norm": 0.0, "learning_rate": 3.5e-06, "loss": 0.0, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 114.21875, "completions/mean_terminated_length": 114.21875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.09377165511250496, "epoch": 0.05016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4999999387592413e-06, "loss": 0.0, "num_tokens": 27387261.0, "reward": 0.12378276884555817, "reward_std": 0.0, "rewards/reward_fn/mean": 0.12378276884555817, "rewards/reward_fn/std": 0.3287852704524994, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09517207369208336, "epoch": 0.05024, "grad_norm": 0.0, "learning_rate": 3.4999997550369713e-06, "loss": 0.0, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 115.4140625, "completions/mean_terminated_length": 114.30708312988281, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.10300083085894585, "epoch": 0.05032, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4999994488332014e-06, "loss": 0.0, "num_tokens": 27467570.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10271292924880981, "epoch": 0.0504, "grad_norm": 0.0, "learning_rate": 3.499999020147954e-06, "loss": 0.0, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 100.0078125, "completions/mean_terminated_length": 100.0078125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.1318642944097519, "epoch": 0.05048, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.499998468981258e-06, "loss": 0.0, "num_tokens": 27545907.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12805821001529694, "epoch": 0.05056, "grad_norm": 0.0, "learning_rate": 3.499997795333153e-06, "loss": 0.0, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 120.6171875, "completions/mean_terminated_length": 120.6171875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.09593478217720985, "epoch": 0.05064, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4999969992036853e-06, "loss": 0.0, "num_tokens": 27626882.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09880076721310616, "epoch": 0.05072, "grad_norm": 0.0, "learning_rate": 3.4999960805929118e-06, "loss": 0.0, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 114.40625, "completions/mean_terminated_length": 113.29133605957031, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.09438963234424591, "epoch": 0.0508, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4999950395008957e-06, "loss": 0.0, "num_tokens": 27707062.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09450766444206238, "epoch": 0.05088, "grad_norm": 0.0, "learning_rate": 3.4999938759277106e-06, "loss": 0.0, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 130.8359375, "completions/mean_terminated_length": 130.8359375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.08405666053295135, "epoch": 0.05096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4999925898734376e-06, "loss": 0.0, "num_tokens": 27789345.0, "reward": 0.1095491349697113, "reward_std": 0.0, "rewards/reward_fn/mean": 0.1095491349697113, "rewards/reward_fn/std": 0.2909786105155945, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08975117653608322, "epoch": 0.05104, "grad_norm": 0.0, "learning_rate": 3.4999911813381665e-06, "loss": 0.0, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 107.171875, "completions/mean_terminated_length": 107.171875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.09579388424754143, "epoch": 0.05112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4999896503219965e-06, "loss": 0.0, "num_tokens": 27868599.0, "reward": 0.012458499521017075, "reward_std": 0.0, "rewards/reward_fn/mean": 0.012458499521017075, "rewards/reward_fn/std": 0.03309160843491554, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09525198116898537, "epoch": 0.0512, "grad_norm": 0.0, "learning_rate": 3.499987996825034e-06, "loss": 0.0, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 112.9765625, "completions/mean_terminated_length": 112.9765625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.09973648935556412, "epoch": 0.05128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.499986220847395e-06, "loss": 0.0, "num_tokens": 27948596.0, "reward": 0.8462333083152771, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8462333083152771, "rewards/reward_fn/std": 1.2730424404144287, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10512340441346169, "epoch": 0.05136, "grad_norm": 0.0, "learning_rate": 3.499984322389204e-06, "loss": 0.0, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 99.390625, "completions/mean_terminated_length": 99.390625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.10271813347935677, "epoch": 0.05144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4999823014505936e-06, "loss": 0.0, "num_tokens": 28026854.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11361667886376381, "epoch": 0.05152, "grad_norm": 0.0, "learning_rate": 3.4999801580317055e-06, "loss": 0.0, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 107.390625, "completions/mean_terminated_length": 106.22047424316406, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.10589085519313812, "epoch": 0.0516, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.49997789213269e-06, "loss": 0.0, "num_tokens": 28106136.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10258827731013298, "epoch": 0.05168, "grad_norm": 0.0, "learning_rate": 3.499975503753705e-06, "loss": 0.0, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 101.609375, "completions/mean_terminated_length": 101.609375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.11445674300193787, "epoch": 0.05176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.499972992894918e-06, "loss": 0.0, "num_tokens": 28184678.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11513804644346237, "epoch": 0.05184, "grad_norm": 0.0, "learning_rate": 3.499970359556505e-06, "loss": 0.0, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 116.5078125, "completions/mean_terminated_length": 116.5078125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.07584754005074501, "epoch": 0.05192, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4999676037386494e-06, "loss": 0.0, "num_tokens": 28265127.0, "reward": 0.4091131389141083, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4091131389141083, "rewards/reward_fn/std": 0.9871928691864014, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07606785744428635, "epoch": 0.052, "grad_norm": 0.0, "learning_rate": 3.4999647254415454e-06, "loss": 0.0, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 256.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 119.8671875, "completions/mean_terminated_length": 117.70635223388672, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.09504968672990799, "epoch": 0.05208, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.499961724665393e-06, "loss": 0.0, "num_tokens": 28346006.0, "reward": 0.4136883616447449, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4136883616447449, "rewards/reward_fn/std": 0.9866312742233276, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0949641801416874, "epoch": 0.05216, "grad_norm": 0.0, "learning_rate": 3.499958601410403e-06, "loss": 0.0, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 117.5078125, "completions/mean_terminated_length": 116.41732025146484, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.08295572176575661, "epoch": 0.05224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.499955355676795e-06, "loss": 0.0, "num_tokens": 28426583.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08322695270180702, "epoch": 0.05232, "grad_norm": 0.0, "learning_rate": 3.499951987464794e-06, "loss": 0.0, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 120.015625, "completions/mean_terminated_length": 120.015625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.10048670694231987, "epoch": 0.0524, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4999484967746374e-06, "loss": 0.0, "num_tokens": 28507481.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09953974187374115, "epoch": 0.05248, "grad_norm": 0.0, "learning_rate": 3.4999448836065688e-06, "loss": 0.0, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 108.71875, "completions/mean_terminated_length": 108.71875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.08584844693541527, "epoch": 0.05256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.499941147960841e-06, "loss": 0.0, "num_tokens": 28586933.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08570488169789314, "epoch": 0.05264, "grad_norm": 0.0, "learning_rate": 3.4999372898377162e-06, "loss": 0.0, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 110.078125, "completions/mean_terminated_length": 110.078125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.09355699270963669, "epoch": 0.05272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4999333092374637e-06, "loss": 0.0, "num_tokens": 28666559.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09329136833548546, "epoch": 0.0528, "grad_norm": 0.0, "learning_rate": 3.4999292061603624e-06, "loss": 0.0, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 124.1015625, "completions/mean_terminated_length": 124.1015625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.08385832607746124, "epoch": 0.05288, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4999249806066993e-06, "loss": 0.0, "num_tokens": 28747980.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0859147310256958, "epoch": 0.05296, "grad_norm": 0.0, "learning_rate": 3.4999206325767706e-06, "loss": 0.0, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 115.2109375, "completions/mean_terminated_length": 114.10236358642578, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.09494730830192566, "epoch": 0.05304, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.49991616207088e-06, "loss": 0.0, "num_tokens": 28828263.0, "reward": 0.10180176049470901, "reward_std": 0.0, "rewards/reward_fn/mean": 0.10180176049470901, "rewards/reward_fn/std": 0.2704004645347595, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09825869649648666, "epoch": 0.05312, "grad_norm": 0.0, "learning_rate": 3.4999115690893405e-06, "loss": 0.0, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 103.28125, "completions/mean_terminated_length": 103.28125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.09339979663491249, "epoch": 0.0532, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.499906853632474e-06, "loss": 0.0, "num_tokens": 28907019.0, "reward": 0.4999995827674866, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4999995827674866, "rewards/reward_fn/std": 1.0039290189743042, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0929812416434288, "epoch": 0.05328, "grad_norm": 0.0, "learning_rate": 3.49990201570061e-06, "loss": 0.0, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 256.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 129.2578125, "completions/mean_terminated_length": 127.24604034423828, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.08779023215174675, "epoch": 0.05336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.499897055294088e-06, "loss": 0.0, "num_tokens": 28989100.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08518768846988678, "epoch": 0.05344, "grad_norm": 0.0, "learning_rate": 3.4998919724132537e-06, "loss": 0.0, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 104.9375, "completions/mean_terminated_length": 104.9375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.09733079373836517, "epoch": 0.05352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.499886767058464e-06, "loss": 0.0, "num_tokens": 29068068.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10100510716438293, "epoch": 0.0536, "grad_norm": 0.0, "learning_rate": 3.4998814392300832e-06, "loss": 0.0, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 117.1953125, "completions/mean_terminated_length": 117.1953125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.10263441503047943, "epoch": 0.05368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4998759889284835e-06, "loss": 0.0, "num_tokens": 29148605.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09928787499666214, "epoch": 0.05376, "grad_norm": 0.0, "learning_rate": 3.499870416154047e-06, "loss": 0.0, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 103.2265625, "completions/mean_terminated_length": 102.02362060546875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.10089743509888649, "epoch": 0.05384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.499864720907163e-06, "loss": 0.0, "num_tokens": 29227354.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10277792811393738, "epoch": 0.05392, "grad_norm": 0.0, "learning_rate": 3.4998589031882313e-06, "loss": 0.0, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 99.7109375, "completions/mean_terminated_length": 99.7109375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.11188122630119324, "epoch": 0.054, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.499852962997658e-06, "loss": 0.0, "num_tokens": 29305653.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10618799924850464, "epoch": 0.05408, "grad_norm": 0.0, "learning_rate": 3.4998469003358594e-06, "loss": 0.0, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 111.578125, "completions/mean_terminated_length": 111.578125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.10962583497166634, "epoch": 0.05416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4998407152032595e-06, "loss": 0.0, "num_tokens": 29385471.0, "reward": 0.8675283193588257, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8675283193588257, "rewards/reward_fn/std": 1.273011565208435, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10478732734918594, "epoch": 0.05424, "grad_norm": 0.0, "learning_rate": 3.4998344076002917e-06, "loss": 0.0, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 124.9375, "completions/mean_terminated_length": 121.79200744628906, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.08677079901099205, "epoch": 0.05432, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4998279775273967e-06, "loss": 0.0, "num_tokens": 29466999.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0856078565120697, "epoch": 0.0544, "grad_norm": 0.0, "learning_rate": 3.499821424985025e-06, "loss": 0.0, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 256.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 119.359375, "completions/mean_terminated_length": 117.19048309326172, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.08195074275135994, "epoch": 0.05448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.499814749973635e-06, "loss": 0.0, "num_tokens": 29547813.0, "reward": 0.11171221733093262, "reward_std": 0.0, "rewards/reward_fn/mean": 0.11171221733093262, "rewards/reward_fn/std": 0.2967240810394287, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08504123240709305, "epoch": 0.05456, "grad_norm": 0.0, "learning_rate": 3.499807952493695e-06, "loss": 0.0, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 116.4296875, "completions/mean_terminated_length": 116.4296875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.08802806958556175, "epoch": 0.05464, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4998010325456784e-06, "loss": 0.0, "num_tokens": 29628252.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08795046806335449, "epoch": 0.05472, "grad_norm": 0.0, "learning_rate": 3.499793990130072e-06, "loss": 0.0, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 118.890625, "completions/mean_terminated_length": 118.890625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.09573201462626457, "epoch": 0.0548, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.499786825247367e-06, "loss": 0.0, "num_tokens": 29709006.0, "reward": 0.37749966979026794, "reward_std": 0.0, "rewards/reward_fn/mean": 0.37749966979026794, "rewards/reward_fn/std": 0.9951284527778625, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09318869188427925, "epoch": 0.05488, "grad_norm": 0.0, "learning_rate": 3.4997795378980655e-06, "loss": 0.0, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 107.9453125, "completions/mean_terminated_length": 107.9453125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.1022971123456955, "epoch": 0.05496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.499772128082678e-06, "loss": 0.0, "num_tokens": 29788359.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09363570436835289, "epoch": 0.05504, "grad_norm": 0.0, "learning_rate": 3.499764595801722e-06, "loss": 0.0, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 97.3046875, "completions/mean_terminated_length": 97.3046875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.0957718938589096, "epoch": 0.05512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4997569410557258e-06, "loss": 0.0, "num_tokens": 29866350.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09089522808790207, "epoch": 0.0552, "grad_norm": 0.0, "learning_rate": 3.499749163845224e-06, "loss": 0.0, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 115.5, "completions/mean_terminated_length": 115.5, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.08941814303398132, "epoch": 0.05528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.499741264170762e-06, "loss": 0.0, "num_tokens": 29946670.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09295230731368065, "epoch": 0.05536, "grad_norm": 0.0, "learning_rate": 3.4997332420328926e-06, "loss": 0.0, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 116.953125, "completions/mean_terminated_length": 116.953125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.09887906908988953, "epoch": 0.05544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.499725097432177e-06, "loss": 0.0, "num_tokens": 30027176.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09654634445905685, "epoch": 0.05552, "grad_norm": 0.0, "learning_rate": 3.4997168303691844e-06, "loss": 0.0, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 122.484375, "completions/mean_terminated_length": 121.43306732177734, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.0863012969493866, "epoch": 0.0556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4997084408444946e-06, "loss": 0.0, "num_tokens": 30108390.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08105605095624924, "epoch": 0.05568, "grad_norm": 0.0, "learning_rate": 3.4996999288586943e-06, "loss": 0.0, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 104.5546875, "completions/mean_terminated_length": 104.5546875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.08684713765978813, "epoch": 0.05576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4996912944123794e-06, "loss": 0.0, "num_tokens": 30187309.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08490288630127907, "epoch": 0.05584, "grad_norm": 0.0, "learning_rate": 3.499682537506154e-06, "loss": 0.0, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 111.6015625, "completions/mean_terminated_length": 111.6015625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.08757134154438972, "epoch": 0.05592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.499673658140631e-06, "loss": 0.0, "num_tokens": 30267130.0, "reward": 0.4091131389141083, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4091131389141083, "rewards/reward_fn/std": 0.9871928691864014, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08592883870005608, "epoch": 0.056, "grad_norm": 0.0, "learning_rate": 3.4996646563164323e-06, "loss": 0.0, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 256.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 131.390625, "completions/mean_terminated_length": 128.40000915527344, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.08260359615087509, "epoch": 0.05608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4996555320341873e-06, "loss": 0.0, "num_tokens": 30349484.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07872408255934715, "epoch": 0.05616, "grad_norm": 0.0, "learning_rate": 3.499646285294535e-06, "loss": 0.0, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 111.5390625, "completions/mean_terminated_length": 111.5390625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.09274516627192497, "epoch": 0.05624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.499636916098123e-06, "loss": 0.0, "num_tokens": 30429297.0, "reward": 0.790934681892395, "reward_std": 0.0, "rewards/reward_fn/mean": 0.790934681892395, "rewards/reward_fn/std": 1.2848050594329834, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09607721492648125, "epoch": 0.05632, "grad_norm": 0.0, "learning_rate": 3.499627424445606e-06, "loss": 0.0, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 117.9921875, "completions/mean_terminated_length": 117.9921875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.11880143731832504, "epoch": 0.0564, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.499617810337649e-06, "loss": 0.0, "num_tokens": 30509936.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10129240900278091, "epoch": 0.05648, "grad_norm": 0.0, "learning_rate": 3.499608073774924e-06, "loss": 0.0, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 116.7578125, "completions/mean_terminated_length": 116.7578125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.08667147532105446, "epoch": 0.05656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.499598214758114e-06, "loss": 0.0, "num_tokens": 30590417.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08805737644433975, "epoch": 0.05664, "grad_norm": 0.0, "learning_rate": 3.4995882332879085e-06, "loss": 0.0, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 119.9765625, "completions/mean_terminated_length": 119.9765625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.09640844911336899, "epoch": 0.05672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4995781293650052e-06, "loss": 0.0, "num_tokens": 30671310.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09679620712995529, "epoch": 0.0568, "grad_norm": 0.0, "learning_rate": 3.499567902990112e-06, "loss": 0.0, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 109.0234375, "completions/mean_terminated_length": 109.0234375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.1008693017065525, "epoch": 0.05688, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4995575541639444e-06, "loss": 0.0, "num_tokens": 30750801.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.103708915412426, "epoch": 0.05696, "grad_norm": 0.0, "learning_rate": 3.4995470828872272e-06, "loss": 0.0, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 104.1015625, "completions/mean_terminated_length": 104.1015625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.10902595520019531, "epoch": 0.05704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4995364891606926e-06, "loss": 0.0, "num_tokens": 30829662.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10811987891793251, "epoch": 0.05712, "grad_norm": 0.0, "learning_rate": 3.4995257729850824e-06, "loss": 0.0, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 118.015625, "completions/mean_terminated_length": 118.015625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.0969587154686451, "epoch": 0.0572, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4995149343611468e-06, "loss": 0.0, "num_tokens": 30910304.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09609462693333626, "epoch": 0.05728, "grad_norm": 0.0, "learning_rate": 3.499503973289644e-06, "loss": 0.0, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 120.6875, "completions/mean_terminated_length": 120.6875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.08243872225284576, "epoch": 0.05736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4994928897713413e-06, "loss": 0.0, "num_tokens": 30991288.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0850922092795372, "epoch": 0.05744, "grad_norm": 0.0, "learning_rate": 3.4994816838070143e-06, "loss": 0.0, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 102.0859375, "completions/mean_terminated_length": 100.87401580810547, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.113995760679245, "epoch": 0.05752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4994703553974476e-06, "loss": 0.0, "num_tokens": 31069891.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11316899210214615, "epoch": 0.0576, "grad_norm": 0.0, "learning_rate": 3.499458904543434e-06, "loss": 0.0, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 107.3046875, "completions/mean_terminated_length": 107.3046875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.08068143203854561, "epoch": 0.05768, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.499447331245775e-06, "loss": 0.0, "num_tokens": 31149162.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08082816377282143, "epoch": 0.05776, "grad_norm": 0.0, "learning_rate": 3.4994356355052802e-06, "loss": 0.0, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 113.375, "completions/mean_terminated_length": 108.7741928100586, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.09366168454289436, "epoch": 0.05784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.499423817322769e-06, "loss": 0.0, "num_tokens": 31229210.0, "reward": 0.02706475742161274, "reward_std": 0.0, "rewards/reward_fn/mean": 0.02706475742161274, "rewards/reward_fn/std": 0.07188798487186432, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0968434289097786, "epoch": 0.05792, "grad_norm": 0.0, "learning_rate": 3.499411876699067e-06, "loss": 0.0, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 119.9375, "completions/mean_terminated_length": 118.86614227294922, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.09615499898791313, "epoch": 0.058, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4993998136350117e-06, "loss": 0.0, "num_tokens": 31310098.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09041160717606544, "epoch": 0.05808, "grad_norm": 0.0, "learning_rate": 3.4993876281314463e-06, "loss": 0.0, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 114.1875, "completions/mean_terminated_length": 114.1875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.09514517709612846, "epoch": 0.05816, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.499375320189224e-06, "loss": 0.0, "num_tokens": 31390250.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09269405528903008, "epoch": 0.05824, "grad_norm": 0.0, "learning_rate": 3.4993628898092056e-06, "loss": 0.0, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 110.8515625, "completions/mean_terminated_length": 110.8515625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.0815260261297226, "epoch": 0.05832, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.499350336992263e-06, "loss": 0.0, "num_tokens": 31469975.0, "reward": 0.054613638669252396, "reward_std": 0.0, "rewards/reward_fn/mean": 0.054613638669252396, "rewards/reward_fn/std": 0.13055464625358582, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08185720071196556, "epoch": 0.0584, "grad_norm": 0.0, "learning_rate": 3.4993376617392724e-06, "loss": 0.0, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 116.703125, "completions/mean_terminated_length": 116.703125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.10925139859318733, "epoch": 0.05848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4993248640511223e-06, "loss": 0.0, "num_tokens": 31550449.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09997100383043289, "epoch": 0.05856, "grad_norm": 0.0, "learning_rate": 3.499311943928708e-06, "loss": 0.0, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 102.046875, "completions/mean_terminated_length": 102.046875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.09284191206097603, "epoch": 0.05864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.499298901372934e-06, "loss": 0.0, "num_tokens": 31629047.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08424169197678566, "epoch": 0.05872, "grad_norm": 0.0, "learning_rate": 3.499285736384713e-06, "loss": 0.0, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 108.6640625, "completions/mean_terminated_length": 108.6640625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.11024625599384308, "epoch": 0.0588, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4992724489649663e-06, "loss": 0.0, "num_tokens": 31708492.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09919709712266922, "epoch": 0.05888, "grad_norm": 0.0, "learning_rate": 3.4992590391146237e-06, "loss": 0.0, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 118.28125, "completions/mean_terminated_length": 118.28125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.09181509166955948, "epoch": 0.05896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4992455068346243e-06, "loss": 0.0, "num_tokens": 31789168.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09521148353815079, "epoch": 0.05904, "grad_norm": 0.0, "learning_rate": 3.499231852125915e-06, "loss": 0.0, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 110.4375, "completions/mean_terminated_length": 110.4375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.08400151133537292, "epoch": 0.05912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.499218074989451e-06, "loss": 0.0, "num_tokens": 31868840.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07993270456790924, "epoch": 0.0592, "grad_norm": 0.0, "learning_rate": 3.4992041754261976e-06, "loss": 0.0, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 100.328125, "completions/mean_terminated_length": 100.328125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.12014191597700119, "epoch": 0.05928, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4991901534371267e-06, "loss": 0.0, "num_tokens": 31947218.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12176444008946419, "epoch": 0.05936, "grad_norm": 0.0, "learning_rate": 3.49917600902322e-06, "loss": 0.0, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 111.1484375, "completions/mean_terminated_length": 111.1484375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.09398579970002174, "epoch": 0.05944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.499161742185468e-06, "loss": 0.0, "num_tokens": 32026981.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09525588527321815, "epoch": 0.05952, "grad_norm": 0.0, "learning_rate": 3.499147352924868e-06, "loss": 0.0, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 256.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 116.171875, "completions/mean_terminated_length": 112.81600189208984, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.09242620691657066, "epoch": 0.0596, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.499132841242428e-06, "loss": 0.0, "num_tokens": 32107387.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09450759366154671, "epoch": 0.05968, "grad_norm": 0.0, "learning_rate": 3.499118207139164e-06, "loss": 0.0, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 256.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 112.4765625, "completions/mean_terminated_length": 109.0320053100586, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.11324436217546463, "epoch": 0.05976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.499103450616099e-06, "loss": 0.0, "num_tokens": 32187320.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09757941961288452, "epoch": 0.05984, "grad_norm": 0.0, "learning_rate": 3.499088571674266e-06, "loss": 0.0, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 118.8984375, "completions/mean_terminated_length": 103.39999389648438, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.11636082082986832, "epoch": 0.05992, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4990735703147076e-06, "loss": 0.0, "num_tokens": 32268075.0, "reward": 0.7770648002624512, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7770648002624512, "rewards/reward_fn/std": 1.2903637886047363, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11692037060856819, "epoch": 0.06, "grad_norm": 0.0, "learning_rate": 3.4990584465384727e-06, "loss": 0.0, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 103.6640625, "completions/mean_terminated_length": 103.6640625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.11251771077513695, "epoch": 0.06008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4990432003466206e-06, "loss": 0.0, "num_tokens": 32346880.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11235464736819267, "epoch": 0.06016, "grad_norm": 0.0, "learning_rate": 3.499027831740217e-06, "loss": 0.0, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 102.40625, "completions/mean_terminated_length": 102.40625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.10148745775222778, "epoch": 0.06024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4990123407203394e-06, "loss": 0.0, "num_tokens": 32425524.0, "reward": 0.384978711605072, "reward_std": 0.0, "rewards/reward_fn/mean": 0.384978711605072, "rewards/reward_fn/std": 0.9926154613494873, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1021200679242611, "epoch": 0.06032, "grad_norm": 0.0, "learning_rate": 3.4989967272880704e-06, "loss": 0.0, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 104.25, "completions/mean_terminated_length": 104.25, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.11212345585227013, "epoch": 0.0604, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4989809914445035e-06, "loss": 0.0, "num_tokens": 32504404.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11259425804018974, "epoch": 0.06048, "grad_norm": 0.0, "learning_rate": 3.49896513319074e-06, "loss": 0.0, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 107.53125, "completions/mean_terminated_length": 107.53125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.08899083361029625, "epoch": 0.06056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4989491525278896e-06, "loss": 0.0, "num_tokens": 32583704.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08117613941431046, "epoch": 0.06064, "grad_norm": 0.0, "learning_rate": 3.498933049457071e-06, "loss": 0.0, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 87.5703125, "completions/mean_terminated_length": 87.5703125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.12863566353917122, "epoch": 0.06072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4989168239794113e-06, "loss": 0.0, "num_tokens": 32660449.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.129712525755167, "epoch": 0.0608, "grad_norm": 0.0, "learning_rate": 3.4989004760960456e-06, "loss": 0.0, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 106.5859375, "completions/mean_terminated_length": 106.5859375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.10160820558667183, "epoch": 0.06088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.498884005808119e-06, "loss": 0.0, "num_tokens": 32739628.0, "reward": 0.38249102234840393, "reward_std": 0.0, "rewards/reward_fn/mean": 0.38249102234840393, "rewards/reward_fn/std": 0.9934079051017761, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09853312373161316, "epoch": 0.06096, "grad_norm": 0.0, "learning_rate": 3.498867413116783e-06, "loss": 0.0, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 121.6796875, "completions/mean_terminated_length": 121.6796875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.09850538522005081, "epoch": 0.06104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4988506980232004e-06, "loss": 0.0, "num_tokens": 32820739.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09899640828371048, "epoch": 0.06112, "grad_norm": 0.0, "learning_rate": 3.49883386052854e-06, "loss": 0.0, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 105.96875, "completions/mean_terminated_length": 105.96875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.1167539581656456, "epoch": 0.0612, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.49881690063398e-06, "loss": 0.0, "num_tokens": 32899839.0, "reward": 0.3874585032463074, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3874585032463074, "rewards/reward_fn/std": 0.9918686747550964, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11241551488637924, "epoch": 0.06128, "grad_norm": 0.0, "learning_rate": 3.4987998183407087e-06, "loss": 0.0, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 256.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 88.5546875, "completions/mean_terminated_length": 85.8968276977539, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.10758354514837265, "epoch": 0.06136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.49878261364992e-06, "loss": 0.0, "num_tokens": 32976710.0, "reward": 1.1614141464233398, "reward_std": 0.0, "rewards/reward_fn/mean": 1.1614141464233398, "rewards/reward_fn/std": 1.4327465295791626, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11870747432112694, "epoch": 0.06144, "grad_norm": 0.0, "learning_rate": 3.4987652865628203e-06, "loss": 0.0, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 125.96875, "completions/mean_terminated_length": 121.7741928100586, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.0967794880270958, "epoch": 0.06152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.49874783708062e-06, "loss": 0.0, "num_tokens": 33058370.0, "reward": 0.012458499521017075, "reward_std": 0.0, "rewards/reward_fn/mean": 0.012458499521017075, "rewards/reward_fn/std": 0.03309160843491554, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09248407185077667, "epoch": 0.0616, "grad_norm": 0.0, "learning_rate": 3.4987302652045416e-06, "loss": 0.0, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 107.4921875, "completions/mean_terminated_length": 107.4921875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.08877392113208771, "epoch": 0.06168, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4987125709358144e-06, "loss": 0.0, "num_tokens": 33137665.0, "reward": 0.4327646493911743, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4327646493911743, "rewards/reward_fn/std": 0.985901951789856, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08308897167444229, "epoch": 0.06176, "grad_norm": 0.0, "learning_rate": 3.4986947542756778e-06, "loss": 0.0, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 121.171875, "completions/mean_terminated_length": 121.171875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.09174314513802528, "epoch": 0.06184, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4986768152253776e-06, "loss": 0.0, "num_tokens": 33218711.0, "reward": 0.04315175488591194, "reward_std": 0.0, "rewards/reward_fn/mean": 0.04315175488591194, "rewards/reward_fn/std": 0.11461740732192993, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08823641017079353, "epoch": 0.06192, "grad_norm": 0.0, "learning_rate": 3.49865875378617e-06, "loss": 0.0, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 101.6640625, "completions/mean_terminated_length": 101.6640625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.08026442676782608, "epoch": 0.062, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.498640569959319e-06, "loss": 0.0, "num_tokens": 33297260.0, "reward": 0.5067325234413147, "reward_std": 0.0, "rewards/reward_fn/mean": 0.5067325234413147, "rewards/reward_fn/std": 0.9860954880714417, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07964218035340309, "epoch": 0.06208, "grad_norm": 0.0, "learning_rate": 3.4986222637460978e-06, "loss": 0.0, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 256.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 113.7109375, "completions/mean_terminated_length": 106.7131118774414, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.0927111878991127, "epoch": 0.06216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4986038351477864e-06, "loss": 0.0, "num_tokens": 33377351.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08519364520907402, "epoch": 0.06224, "grad_norm": 0.0, "learning_rate": 3.498585284165675e-06, "loss": 0.0, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 256.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 135.4453125, "completions/mean_terminated_length": 128.4710693359375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.09635230898857117, "epoch": 0.06232, "frac_reward_zero_std": 0.875, "grad_norm": 0.2685765027999878, "learning_rate": 3.498566610801063e-06, "loss": -0.0257, "num_tokens": 33460224.0, "reward": 0.328125, "reward_std": 0.1875, "rewards/reward_fn/mean": 0.328125, "rewards/reward_fn/std": 1.0125929117202759, "step": 779 }, { "clip_ratio/high_max": 0.011934712063521147, "clip_ratio/high_mean": 0.0029836780158802867, "clip_ratio/low_mean": 0.0005296610179357231, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003513338975608349, "entropy": 0.10119850933551788, "epoch": 0.0624, "grad_norm": 0.8919374942779541, "learning_rate": 3.498547815055256e-06, "loss": 0.0424, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 129.421875, "completions/mean_terminated_length": 129.421875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.09761578217148781, "epoch": 0.06248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4985288969295706e-06, "loss": 0.0, "num_tokens": 33542326.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09616047143936157, "epoch": 0.06256, "grad_norm": 0.0, "learning_rate": 3.4985098564253298e-06, "loss": 0.0, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 129.578125, "completions/mean_terminated_length": 129.578125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.08275605738162994, "epoch": 0.06264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.498490693543867e-06, "loss": 0.0, "num_tokens": 33624448.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0782439336180687, "epoch": 0.06272, "grad_norm": 0.0, "learning_rate": 3.498471408286524e-06, "loss": 0.0, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 124.1484375, "completions/mean_terminated_length": 123.11023712158203, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.08557422086596489, "epoch": 0.0628, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4984520006546485e-06, "loss": 0.0, "num_tokens": 33705875.0, "reward": 0.012458499521017075, "reward_std": 0.0, "rewards/reward_fn/mean": 0.012458499521017075, "rewards/reward_fn/std": 0.03309160843491554, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09300351142883301, "epoch": 0.06288, "grad_norm": 0.0, "learning_rate": 3.498432470649601e-06, "loss": 0.0, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 106.171875, "completions/mean_terminated_length": 106.171875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.11804697662591934, "epoch": 0.06296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.498412818272747e-06, "loss": 0.0, "num_tokens": 33785001.0, "reward": 1.1324909925460815, "reward_std": 0.0, "rewards/reward_fn/mean": 1.1324909925460815, "rewards/reward_fn/std": 1.452374815940857, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10385850071907043, "epoch": 0.06304, "grad_norm": 0.0, "learning_rate": 3.4983930435254633e-06, "loss": 0.0, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 256.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 135.546875, "completions/mean_terminated_length": 133.6349334716797, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.09824970737099648, "epoch": 0.06312, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.498373146409132e-06, "loss": 0.0, "num_tokens": 33867887.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10164246335625648, "epoch": 0.0632, "grad_norm": 0.0, "learning_rate": 3.4983531269251474e-06, "loss": 0.0, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 110.4609375, "completions/mean_terminated_length": 110.4609375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.09458594024181366, "epoch": 0.06328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4983329850749096e-06, "loss": 0.0, "num_tokens": 33947562.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10312845557928085, "epoch": 0.06336, "grad_norm": 0.0, "learning_rate": 3.498312720859829e-06, "loss": 0.0, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 126.5234375, "completions/mean_terminated_length": 126.5234375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.0905112475156784, "epoch": 0.06344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4982923342813233e-06, "loss": 0.0, "num_tokens": 34029293.0, "reward": 0.009978720918297768, "reward_std": 0.0, "rewards/reward_fn/mean": 0.009978720918297768, "rewards/reward_fn/std": 0.02650495432317257, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09203771874308586, "epoch": 0.06352, "grad_norm": 0.0, "learning_rate": 3.49827182534082e-06, "loss": 0.0, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 103.9453125, "completions/mean_terminated_length": 103.9453125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.11540063843131065, "epoch": 0.0636, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4982511940397545e-06, "loss": 0.0, "num_tokens": 34108134.0, "reward": 0.4067869484424591, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4067869484424591, "rewards/reward_fn/std": 0.9875356554985046, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11504969000816345, "epoch": 0.06368, "grad_norm": 0.0, "learning_rate": 3.4982304403795696e-06, "loss": 0.0, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 113.1953125, "completions/mean_terminated_length": 113.1953125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.10319552198052406, "epoch": 0.06376, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4982095643617193e-06, "loss": 0.0, "num_tokens": 34188159.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10622233152389526, "epoch": 0.06384, "grad_norm": 0.0, "learning_rate": 3.4981885659876634e-06, "loss": 0.0, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 117.328125, "completions/mean_terminated_length": 117.328125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.10573950782418251, "epoch": 0.06392, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4981674452588734e-06, "loss": 0.0, "num_tokens": 34268713.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10444517433643341, "epoch": 0.064, "grad_norm": 0.0, "learning_rate": 3.4981462021768254e-06, "loss": 0.0, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 125.8359375, "completions/mean_terminated_length": 123.76985168457031, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.11279943957924843, "epoch": 0.06408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.498124836743008e-06, "loss": 0.0, "num_tokens": 34350356.0, "reward": 0.1562952846288681, "reward_std": 0.0, "rewards/reward_fn/mean": 0.1562952846288681, "rewards/reward_fn/std": 0.3059597611427307, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10914979875087738, "epoch": 0.06416, "grad_norm": 0.0, "learning_rate": 3.498103348958915e-06, "loss": 0.0, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 107.203125, "completions/mean_terminated_length": 107.203125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.1122165396809578, "epoch": 0.06424, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.498081738826051e-06, "loss": 0.0, "num_tokens": 34429614.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1182590089738369, "epoch": 0.06432, "grad_norm": 0.0, "learning_rate": 3.4980600063459287e-06, "loss": 0.0, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 102.1953125, "completions/mean_terminated_length": 100.9842529296875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 0.137994222342968, "epoch": 0.0644, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4980381515200687e-06, "loss": 0.0, "num_tokens": 34508231.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1328241489827633, "epoch": 0.06448, "grad_norm": 0.0, "learning_rate": 3.4980161743500014e-06, "loss": 0.0, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 256.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 120.2890625, "completions/mean_terminated_length": 118.13492584228516, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.12614211440086365, "epoch": 0.06456, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4979940748372642e-06, "loss": 0.0, "num_tokens": 34589164.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10559667646884918, "epoch": 0.06464, "grad_norm": 0.0, "learning_rate": 3.4979718529834037e-06, "loss": 0.0, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 111.328125, "completions/mean_terminated_length": 111.328125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.10277532786130905, "epoch": 0.06472, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.497949508789976e-06, "loss": 0.0, "num_tokens": 34668950.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10080163180828094, "epoch": 0.0648, "grad_norm": 0.0, "learning_rate": 3.4979270422585446e-06, "loss": 0.0, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 104.7890625, "completions/mean_terminated_length": 104.7890625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.09430145099759102, "epoch": 0.06488, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.497904453390681e-06, "loss": 0.0, "num_tokens": 34747899.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09066309779882431, "epoch": 0.06496, "grad_norm": 0.0, "learning_rate": 3.4978817421879685e-06, "loss": 0.0, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 99.59375, "completions/mean_terminated_length": 98.3622055053711, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.11684931442141533, "epoch": 0.06504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4978589086519943e-06, "loss": 0.0, "num_tokens": 34826183.0, "reward": 0.019831063225865364, "reward_std": 0.0, "rewards/reward_fn/mean": 0.019831063225865364, "rewards/reward_fn/std": 0.052674222737550735, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11259857565164566, "epoch": 0.06512, "grad_norm": 0.0, "learning_rate": 3.4978359527843573e-06, "loss": 0.0, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 256.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 129.4453125, "completions/mean_terminated_length": 124.30081176757812, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.10966474562883377, "epoch": 0.0652, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.497812874586664e-06, "loss": 0.0, "num_tokens": 34908288.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11553725600242615, "epoch": 0.06528, "grad_norm": 0.0, "learning_rate": 3.497789674060531e-06, "loss": 0.0, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 115.59375, "completions/mean_terminated_length": 115.59375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.1058315746486187, "epoch": 0.06536, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4977663512075796e-06, "loss": 0.0, "num_tokens": 34988620.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10591333359479904, "epoch": 0.06544, "grad_norm": 0.0, "learning_rate": 3.497742906029444e-06, "loss": 0.0, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 105.328125, "completions/mean_terminated_length": 105.328125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.1233566664159298, "epoch": 0.06552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.497719338527765e-06, "loss": 0.0, "num_tokens": 35067638.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1189083680510521, "epoch": 0.0656, "grad_norm": 0.0, "learning_rate": 3.4976956487041917e-06, "loss": 0.0, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 120.28125, "completions/mean_terminated_length": 119.21260070800781, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.11341621726751328, "epoch": 0.06568, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4976718365603816e-06, "loss": 0.0, "num_tokens": 35148570.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11707345768809319, "epoch": 0.06576, "grad_norm": 0.0, "learning_rate": 3.4976479020980022e-06, "loss": 0.0, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 256.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 113.2578125, "completions/mean_terminated_length": 110.99207305908203, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.13158171623945236, "epoch": 0.06584, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4976238453187285e-06, "loss": 0.0, "num_tokens": 35228603.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1345568150281906, "epoch": 0.06592, "grad_norm": 0.0, "learning_rate": 3.4975996662242435e-06, "loss": 0.0, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 125.9375, "completions/mean_terminated_length": 125.9375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.10352106019854546, "epoch": 0.066, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.49757536481624e-06, "loss": 0.0, "num_tokens": 35310259.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12012812122702599, "epoch": 0.06608, "grad_norm": 0.0, "learning_rate": 3.4975509410964195e-06, "loss": 0.0, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 116.140625, "completions/mean_terminated_length": 115.03936767578125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.13679703325033188, "epoch": 0.06616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4975263950664905e-06, "loss": 0.0, "num_tokens": 35390661.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11876184865832329, "epoch": 0.06624, "grad_norm": 0.0, "learning_rate": 3.4975017267281706e-06, "loss": 0.0, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 118.6171875, "completions/mean_terminated_length": 117.53543090820312, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.1330471783876419, "epoch": 0.06632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4974769360831877e-06, "loss": 0.0, "num_tokens": 35471380.0, "reward": 0.5980561971664429, "reward_std": 0.0, "rewards/reward_fn/mean": 0.5980561971664429, "rewards/reward_fn/std": 0.9873289465904236, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1309695616364479, "epoch": 0.0664, "grad_norm": 0.0, "learning_rate": 3.4974520231332757e-06, "loss": 0.0, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 113.125, "completions/mean_terminated_length": 113.125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.10963369160890579, "epoch": 0.06648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.497426987880179e-06, "loss": 0.0, "num_tokens": 35551396.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10905405506491661, "epoch": 0.06656, "grad_norm": 0.0, "learning_rate": 3.4974018303256497e-06, "loss": 0.0, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 122.0703125, "completions/mean_terminated_length": 121.0157470703125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.11840715259313583, "epoch": 0.06664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.497376550471447e-06, "loss": 0.0, "num_tokens": 35632557.0, "reward": 0.051705554127693176, "reward_std": 0.0, "rewards/reward_fn/mean": 0.051705554127693176, "rewards/reward_fn/std": 0.1373375654220581, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11903255060315132, "epoch": 0.06672, "grad_norm": 0.0, "learning_rate": 3.497351148319343e-06, "loss": 0.0, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 108.4453125, "completions/mean_terminated_length": 108.4453125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.13850709795951843, "epoch": 0.0668, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4973256238711135e-06, "loss": 0.0, "num_tokens": 35711974.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1275894194841385, "epoch": 0.06688, "grad_norm": 0.0, "learning_rate": 3.4972999771285455e-06, "loss": 0.0, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 114.4765625, "completions/mean_terminated_length": 114.4765625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.10174395143985748, "epoch": 0.06696, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4972742080934345e-06, "loss": 0.0, "num_tokens": 35792163.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10423779860138893, "epoch": 0.06704, "grad_norm": 0.0, "learning_rate": 3.497248316767583e-06, "loss": 0.0, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 111.625, "completions/mean_terminated_length": 111.625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.10799278318881989, "epoch": 0.06712, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4972223031528046e-06, "loss": 0.0, "num_tokens": 35871987.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1055571399629116, "epoch": 0.0672, "grad_norm": 0.0, "learning_rate": 3.4971961672509186e-06, "loss": 0.0, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 256.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 112.171875, "completions/mean_terminated_length": 108.72000885009766, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.13728874176740646, "epoch": 0.06728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4971699090637546e-06, "loss": 0.0, "num_tokens": 35951881.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.125979445874691, "epoch": 0.06736, "grad_norm": 0.0, "learning_rate": 3.4971435285931508e-06, "loss": 0.0, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 128.46875, "completions/mean_terminated_length": 128.46875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.10363177210092545, "epoch": 0.06744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4971170258409536e-06, "loss": 0.0, "num_tokens": 36033861.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09931762516498566, "epoch": 0.06752, "grad_norm": 0.0, "learning_rate": 3.4970904008090167e-06, "loss": 0.0, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 107.046875, "completions/mean_terminated_length": 107.046875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.10813192650675774, "epoch": 0.0676, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4970636534992053e-06, "loss": 0.0, "num_tokens": 36113099.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10993487015366554, "epoch": 0.06768, "grad_norm": 0.0, "learning_rate": 3.4970367839133906e-06, "loss": 0.0, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 126.0, "completions/mean_terminated_length": 126.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.10269782319664955, "epoch": 0.06776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4970097920534534e-06, "loss": 0.0, "num_tokens": 36194763.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10625327751040459, "epoch": 0.06784, "grad_norm": 0.0, "learning_rate": 3.4969826779212825e-06, "loss": 0.0, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 97.8984375, "completions/mean_terminated_length": 96.6535415649414, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 0.1202537789940834, "epoch": 0.06792, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4969554415187753e-06, "loss": 0.0, "num_tokens": 36272830.0, "reward": 0.37749966979026794, "reward_std": 0.0, "rewards/reward_fn/mean": 0.37749966979026794, "rewards/reward_fn/std": 0.9951284527778625, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12555000185966492, "epoch": 0.068, "grad_norm": 0.0, "learning_rate": 3.4969280828478393e-06, "loss": 0.0, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 94.765625, "completions/mean_terminated_length": 94.765625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.10792607069015503, "epoch": 0.06808, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.496900601910388e-06, "loss": 0.0, "num_tokens": 36350496.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11125465855002403, "epoch": 0.06816, "grad_norm": 0.0, "learning_rate": 3.4968729987083453e-06, "loss": 0.0, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 103.453125, "completions/mean_terminated_length": 103.453125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.11523385345935822, "epoch": 0.06824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.496845273243643e-06, "loss": 0.0, "num_tokens": 36429274.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11290798336267471, "epoch": 0.06832, "grad_norm": 0.0, "learning_rate": 3.496817425518222e-06, "loss": 0.0, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 125.7734375, "completions/mean_terminated_length": 125.7734375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.10216069221496582, "epoch": 0.0684, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4967894555340315e-06, "loss": 0.0, "num_tokens": 36510909.0, "reward": 0.44561243057250977, "reward_std": 0.0, "rewards/reward_fn/mean": 0.44561243057250977, "rewards/reward_fn/std": 0.9868775606155396, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10722962766885757, "epoch": 0.06848, "grad_norm": 0.0, "learning_rate": 3.4967613632930283e-06, "loss": 0.0, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 119.6640625, "completions/mean_terminated_length": 115.26612854003906, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.10347488150000572, "epoch": 0.06856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.496733148797179e-06, "loss": 0.0, "num_tokens": 36591762.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10685179755091667, "epoch": 0.06864, "grad_norm": 0.0, "learning_rate": 3.4967048120484588e-06, "loss": 0.0, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 129.15625, "completions/mean_terminated_length": 122.91802215576172, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.11947173997759819, "epoch": 0.06872, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4966763530488492e-06, "loss": 0.0, "num_tokens": 36673830.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12749796360731125, "epoch": 0.0688, "grad_norm": 0.0, "learning_rate": 3.4966477718003444e-06, "loss": 0.0, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 107.5859375, "completions/mean_terminated_length": 107.5859375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.11710721999406815, "epoch": 0.06888, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.496619068304943e-06, "loss": 0.0, "num_tokens": 36753137.0, "reward": 0.4136883616447449, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4136883616447449, "rewards/reward_fn/std": 0.9866312742233276, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1092463806271553, "epoch": 0.06896, "grad_norm": 0.0, "learning_rate": 3.496590242564655e-06, "loss": 0.0, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 109.4375, "completions/mean_terminated_length": 109.4375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.11387811228632927, "epoch": 0.06904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.496561294581497e-06, "loss": 0.0, "num_tokens": 36832681.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11943633109331131, "epoch": 0.06912, "grad_norm": 0.0, "learning_rate": 3.496532224357496e-06, "loss": 0.0, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 114.2421875, "completions/mean_terminated_length": 113.12598419189453, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.11999671533703804, "epoch": 0.0692, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.496503031894686e-06, "loss": 0.0, "num_tokens": 36912840.0, "reward": 0.4844839572906494, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4844839572906494, "rewards/reward_fn/std": 0.9722241163253784, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11201747879385948, "epoch": 0.06928, "grad_norm": 0.0, "learning_rate": 3.496473717195111e-06, "loss": 0.0, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 111.25, "completions/mean_terminated_length": 111.25, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.09580185264348984, "epoch": 0.06936, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.496444280260821e-06, "loss": 0.0, "num_tokens": 36992616.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09823241457343102, "epoch": 0.06944, "grad_norm": 0.0, "learning_rate": 3.4964147210938775e-06, "loss": 0.0, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 256.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 137.6015625, "completions/mean_terminated_length": 123.0614013671875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.11445628479123116, "epoch": 0.06952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4963850396963497e-06, "loss": 0.0, "num_tokens": 37075765.0, "reward": 0.41815176606178284, "reward_std": 0.0, "rewards/reward_fn/mean": 0.41815176606178284, "rewards/reward_fn/std": 0.9862273335456848, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1100926510989666, "epoch": 0.0696, "grad_norm": 0.0, "learning_rate": 3.4963552360703142e-06, "loss": 0.0, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 115.359375, "completions/mean_terminated_length": 115.359375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.11426214501261711, "epoch": 0.06968, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4963253102178573e-06, "loss": 0.0, "num_tokens": 37156067.0, "reward": 0.4224936366081238, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4224936366081238, "rewards/reward_fn/std": 0.9859711527824402, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11235499382019043, "epoch": 0.06976, "grad_norm": 0.0, "learning_rate": 3.4962952621410728e-06, "loss": 0.0, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 100.578125, "completions/mean_terminated_length": 99.35433197021484, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 0.11104708537459373, "epoch": 0.06984, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4962650918420647e-06, "loss": 0.0, "num_tokens": 37234477.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1250855177640915, "epoch": 0.06992, "grad_norm": 0.0, "learning_rate": 3.4962347993229442e-06, "loss": 0.0, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 113.375, "completions/mean_terminated_length": 108.7741928100586, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.12415875121951103, "epoch": 0.07, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4962043845858316e-06, "loss": 0.0, "num_tokens": 37314525.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12016966566443443, "epoch": 0.07008, "grad_norm": 0.0, "learning_rate": 3.4961738476328554e-06, "loss": 0.0, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 110.4296875, "completions/mean_terminated_length": 108.11905670166016, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.09231389313936234, "epoch": 0.07016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.496143188466153e-06, "loss": 0.0, "num_tokens": 37394196.0, "reward": 0.12130649387836456, "reward_std": 0.0, "rewards/reward_fn/mean": 0.12130649387836456, "rewards/reward_fn/std": 0.3222079277038574, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09770107641816139, "epoch": 0.07024, "grad_norm": 0.0, "learning_rate": 3.49611240708787e-06, "loss": 0.0, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 116.4296875, "completions/mean_terminated_length": 116.4296875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.08702141791582108, "epoch": 0.07032, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.496081503500161e-06, "loss": 0.0, "num_tokens": 37474635.0, "reward": 0.08756738901138306, "reward_std": 0.0, "rewards/reward_fn/mean": 0.08756738901138306, "rewards/reward_fn/std": 0.1597065031528473, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08601841330528259, "epoch": 0.0704, "grad_norm": 0.0, "learning_rate": 3.496050477705189e-06, "loss": 0.0, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 110.5078125, "completions/mean_terminated_length": 110.5078125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.11610795557498932, "epoch": 0.07048, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.496019329705125e-06, "loss": 0.0, "num_tokens": 37554316.0, "reward": 1.5, "reward_std": 0.0, "rewards/reward_fn/mean": 1.5, "rewards/reward_fn/std": 1.5058939456939697, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10968328639864922, "epoch": 0.07056, "grad_norm": 0.0, "learning_rate": 3.4959880595021496e-06, "loss": 0.0, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 114.6328125, "completions/mean_terminated_length": 114.6328125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.10188555344939232, "epoch": 0.07064, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.495956667098451e-06, "loss": 0.0, "num_tokens": 37634525.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10063740238547325, "epoch": 0.07072, "grad_norm": 0.0, "learning_rate": 3.4959251524962267e-06, "loss": 0.0, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 104.8828125, "completions/mean_terminated_length": 104.8828125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.11122798919677734, "epoch": 0.0708, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4958935156976826e-06, "loss": 0.0, "num_tokens": 37713486.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11372562497854233, "epoch": 0.07088, "grad_norm": 0.0, "learning_rate": 3.4958617567050317e-06, "loss": 0.0, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 130.9453125, "completions/mean_terminated_length": 126.91128540039062, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.10303831100463867, "epoch": 0.07096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.495829875520498e-06, "loss": 0.0, "num_tokens": 37795783.0, "reward": 0.12474323064088821, "reward_std": 0.0, "rewards/reward_fn/mean": 0.12474323064088821, "rewards/reward_fn/std": 0.3313363790512085, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09243791550397873, "epoch": 0.07104, "grad_norm": 0.0, "learning_rate": 3.4957978721463126e-06, "loss": 0.0, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 102.0, "completions/mean_terminated_length": 100.78739929199219, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.12667231261730194, "epoch": 0.07112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4957657465847146e-06, "loss": 0.0, "num_tokens": 37874375.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11985263228416443, "epoch": 0.0712, "grad_norm": 0.0, "learning_rate": 3.495733498837954e-06, "loss": 0.0, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 124.1640625, "completions/mean_terminated_length": 124.1640625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.11374783888459206, "epoch": 0.07128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4957011289082863e-06, "loss": 0.0, "num_tokens": 37955804.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10234251245856285, "epoch": 0.07136, "grad_norm": 0.0, "learning_rate": 3.495668636797978e-06, "loss": 0.0, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 121.8828125, "completions/mean_terminated_length": 120.82677459716797, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.11156263947486877, "epoch": 0.07144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4956360225093023e-06, "loss": 0.0, "num_tokens": 38036941.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11092439293861389, "epoch": 0.07152, "grad_norm": 0.0, "learning_rate": 3.495603286044543e-06, "loss": 0.0, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 256.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 128.1953125, "completions/mean_terminated_length": 126.16667175292969, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.12038614973425865, "epoch": 0.0716, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.495570427405991e-06, "loss": 0.0, "num_tokens": 38118886.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1167767196893692, "epoch": 0.07168, "grad_norm": 0.0, "learning_rate": 3.4955374465959448e-06, "loss": 0.0, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 118.8828125, "completions/mean_terminated_length": 118.8828125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.1047329418361187, "epoch": 0.07176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4955043436167145e-06, "loss": 0.0, "num_tokens": 38199639.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1014433354139328, "epoch": 0.07184, "grad_norm": 0.0, "learning_rate": 3.495471118470616e-06, "loss": 0.0, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 123.7578125, "completions/mean_terminated_length": 122.71653747558594, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.10525763034820557, "epoch": 0.07192, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.495437771159975e-06, "loss": 0.0, "num_tokens": 38281016.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09820349141955376, "epoch": 0.072, "grad_norm": 0.0, "learning_rate": 3.495404301687125e-06, "loss": 0.0, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 110.3046875, "completions/mean_terminated_length": 110.3046875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.10432369634509087, "epoch": 0.07208, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4953707100544086e-06, "loss": 0.0, "num_tokens": 38360671.0, "reward": 0.4668777883052826, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4668777883052826, "rewards/reward_fn/std": 0.9749124646186829, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10713974386453629, "epoch": 0.07216, "grad_norm": 0.0, "learning_rate": 3.4953369962641775e-06, "loss": 0.0, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 114.703125, "completions/mean_terminated_length": 114.703125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.10487344861030579, "epoch": 0.07224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.495303160318791e-06, "loss": 0.0, "num_tokens": 38440889.0, "reward": 0.02706475742161274, "reward_std": 0.0, "rewards/reward_fn/mean": 0.02706475742161274, "rewards/reward_fn/std": 0.07188798487186432, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10599946603178978, "epoch": 0.07232, "grad_norm": 0.0, "learning_rate": 3.495269202220617e-06, "loss": 0.0, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 121.640625, "completions/mean_terminated_length": 121.640625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.09306107088923454, "epoch": 0.0724, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4952351219720324e-06, "loss": 0.0, "num_tokens": 38521995.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0959010124206543, "epoch": 0.07248, "grad_norm": 0.0, "learning_rate": 3.495200919575422e-06, "loss": 0.0, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 116.9453125, "completions/mean_terminated_length": 116.9453125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.10558274388313293, "epoch": 0.07256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.495166595033181e-06, "loss": 0.0, "num_tokens": 38602500.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10653937235474586, "epoch": 0.07264, "grad_norm": 0.0, "learning_rate": 3.49513214834771e-06, "loss": 0.0, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 107.71875, "completions/mean_terminated_length": 107.71875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.11845294013619423, "epoch": 0.07272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.495097579521421e-06, "loss": 0.0, "num_tokens": 38681824.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12370914593338966, "epoch": 0.0728, "grad_norm": 0.0, "learning_rate": 3.495062888556733e-06, "loss": 0.0, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 106.3046875, "completions/mean_terminated_length": 105.12598419189453, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.11030516028404236, "epoch": 0.07288, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.495028075456074e-06, "loss": 0.0, "num_tokens": 38760967.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11295373365283012, "epoch": 0.07296, "grad_norm": 0.0, "learning_rate": 3.4949931402218812e-06, "loss": 0.0, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 110.2265625, "completions/mean_terminated_length": 110.2265625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.1143774688243866, "epoch": 0.07304, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4949580828565987e-06, "loss": 0.0, "num_tokens": 38840612.0, "reward": 0.03178694099187851, "reward_std": 0.0, "rewards/reward_fn/mean": 0.03178694099187851, "rewards/reward_fn/std": 0.08443079143762589, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11152348667383194, "epoch": 0.07312, "grad_norm": 0.0, "learning_rate": 3.4949229033626813e-06, "loss": 0.0, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 113.421875, "completions/mean_terminated_length": 113.421875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.1115996278822422, "epoch": 0.0732, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.49488760174259e-06, "loss": 0.0, "num_tokens": 38920666.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11362941563129425, "epoch": 0.07328, "grad_norm": 0.0, "learning_rate": 3.494852177998796e-06, "loss": 0.0, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 256.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 103.46875, "completions/mean_terminated_length": 99.8080062866211, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.12993238121271133, "epoch": 0.07336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.494816632133779e-06, "loss": 0.0, "num_tokens": 38999446.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11700328812003136, "epoch": 0.07344, "grad_norm": 0.0, "learning_rate": 3.4947809641500264e-06, "loss": 0.0, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 115.2578125, "completions/mean_terminated_length": 115.2578125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.11547225713729858, "epoch": 0.07352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4947451740500346e-06, "loss": 0.0, "num_tokens": 39079735.0, "reward": 0.4936048090457916, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4936048090457916, "rewards/reward_fn/std": 1.0008580684661865, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11519291624426842, "epoch": 0.0736, "grad_norm": 0.0, "learning_rate": 3.494709261836308e-06, "loss": 0.0, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 111.828125, "completions/mean_terminated_length": 111.828125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 0.11120706796646118, "epoch": 0.07368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.494673227511362e-06, "loss": 0.0, "num_tokens": 39159585.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10247380658984184, "epoch": 0.07376, "grad_norm": 0.0, "learning_rate": 3.4946370710777164e-06, "loss": 0.0, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 104.2109375, "completions/mean_terminated_length": 103.0157470703125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.11801145225763321, "epoch": 0.07384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.494600792537903e-06, "loss": 0.0, "num_tokens": 39238460.0, "reward": 1.5, "reward_std": 0.0, "rewards/reward_fn/mean": 1.5, "rewards/reward_fn/std": 1.5058939456939697, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12093865126371384, "epoch": 0.07392, "grad_norm": 0.0, "learning_rate": 3.49456439189446e-06, "loss": 0.0, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 117.2109375, "completions/mean_terminated_length": 117.2109375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.11385158076882362, "epoch": 0.074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.494527869149936e-06, "loss": 0.0, "num_tokens": 39318999.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11074972525238991, "epoch": 0.07408, "grad_norm": 0.0, "learning_rate": 3.4944912243068873e-06, "loss": 0.0, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 256.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 127.4609375, "completions/mean_terminated_length": 122.23577117919922, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.1213652491569519, "epoch": 0.07416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.494454457367878e-06, "loss": 0.0, "num_tokens": 39400850.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12220953032374382, "epoch": 0.07424, "grad_norm": 0.0, "learning_rate": 3.4944175683354815e-06, "loss": 0.0, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 96.625, "completions/mean_terminated_length": 96.625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.11987914890050888, "epoch": 0.07432, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4943805572122796e-06, "loss": 0.0, "num_tokens": 39478754.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1194237694144249, "epoch": 0.0744, "grad_norm": 0.0, "learning_rate": 3.4943434240008633e-06, "loss": 0.0, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 113.6640625, "completions/mean_terminated_length": 113.6640625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.1314888298511505, "epoch": 0.07448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4943061687038307e-06, "loss": 0.0, "num_tokens": 39558839.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12523899227380753, "epoch": 0.07456, "grad_norm": 0.0, "learning_rate": 3.4942687913237895e-06, "loss": 0.0, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 256.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 108.28125, "completions/mean_terminated_length": 104.73600769042969, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.11255611851811409, "epoch": 0.07464, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.494231291863356e-06, "loss": 0.0, "num_tokens": 39638235.0, "reward": 0.4136883616447449, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4136883616447449, "rewards/reward_fn/std": 0.9866312742233276, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.119602520018816, "epoch": 0.07472, "grad_norm": 0.0, "learning_rate": 3.494193670325155e-06, "loss": 0.0, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 108.0546875, "completions/mean_terminated_length": 108.0546875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 0.13908684253692627, "epoch": 0.0748, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.494155926711819e-06, "loss": 0.0, "num_tokens": 39717602.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12754419445991516, "epoch": 0.07488, "grad_norm": 0.0, "learning_rate": 3.4941180610259894e-06, "loss": 0.0, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 256.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 118.0703125, "completions/mean_terminated_length": 114.76000213623047, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.10744817927479744, "epoch": 0.07496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.494080073270317e-06, "loss": 0.0, "num_tokens": 39798251.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1049797460436821, "epoch": 0.07504, "grad_norm": 0.0, "learning_rate": 3.494041963447461e-06, "loss": 0.0, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 112.515625, "completions/mean_terminated_length": 112.515625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 0.11456095799803734, "epoch": 0.07512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4940037315600883e-06, "loss": 0.0, "num_tokens": 39878189.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11432864144444466, "epoch": 0.0752, "grad_norm": 0.0, "learning_rate": 3.493965377610874e-06, "loss": 0.0, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 110.421875, "completions/mean_terminated_length": 110.421875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.10626311227679253, "epoch": 0.07528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.493926901602503e-06, "loss": 0.0, "num_tokens": 39957859.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10215441137552261, "epoch": 0.07536, "grad_norm": 0.0, "learning_rate": 3.4938883035376683e-06, "loss": 0.0, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 113.84375, "completions/mean_terminated_length": 112.72441101074219, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.1127939261496067, "epoch": 0.07544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4938495834190715e-06, "loss": 0.0, "num_tokens": 40037967.0, "reward": 0.8330045938491821, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8330045938491821, "rewards/reward_fn/std": 1.2743265628814697, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1078719049692154, "epoch": 0.07552, "grad_norm": 0.0, "learning_rate": 3.493810741249422e-06, "loss": 0.0, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 116.3671875, "completions/mean_terminated_length": 115.26771545410156, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.12590820342302322, "epoch": 0.0756, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4937717770314392e-06, "loss": 0.0, "num_tokens": 40118398.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12543027475476265, "epoch": 0.07568, "grad_norm": 0.0, "learning_rate": 3.4937326907678497e-06, "loss": 0.0, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 111.515625, "completions/mean_terminated_length": 110.3779525756836, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.10422205552458763, "epoch": 0.07576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.493693482461389e-06, "loss": 0.0, "num_tokens": 40198208.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10358374193310738, "epoch": 0.07584, "grad_norm": 0.0, "learning_rate": 3.4936541521148016e-06, "loss": 0.0, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 121.96875, "completions/mean_terminated_length": 121.96875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.1151740774512291, "epoch": 0.07592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4936146997308395e-06, "loss": 0.0, "num_tokens": 40279356.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11284787580370903, "epoch": 0.076, "grad_norm": 0.0, "learning_rate": 3.493575125312265e-06, "loss": 0.0, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 111.28125, "completions/mean_terminated_length": 108.984130859375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.11310816183686256, "epoch": 0.07608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.493535428861847e-06, "loss": 0.0, "num_tokens": 40359136.0, "reward": 0.625, "reward_std": 0.0, "rewards/reward_fn/mean": 0.625, "rewards/reward_fn/std": 1.1153898239135742, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11737369745969772, "epoch": 0.07616, "grad_norm": 0.0, "learning_rate": 3.4934956103823647e-06, "loss": 0.0, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 106.8125, "completions/mean_terminated_length": 106.8125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.09983619675040245, "epoch": 0.07624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.493455669876604e-06, "loss": 0.0, "num_tokens": 40438344.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09863227978348732, "epoch": 0.07632, "grad_norm": 0.0, "learning_rate": 3.4934156073473604e-06, "loss": 0.0, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 113.0390625, "completions/mean_terminated_length": 113.0390625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.10637344419956207, "epoch": 0.0764, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.493375422797439e-06, "loss": 0.0, "num_tokens": 40518349.0, "reward": 0.017386555671691895, "reward_std": 0.0, "rewards/reward_fn/mean": 0.017386555671691895, "rewards/reward_fn/std": 0.04618125036358833, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1069735512137413, "epoch": 0.07648, "grad_norm": 0.0, "learning_rate": 3.493335116229651e-06, "loss": 0.0, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 116.7734375, "completions/mean_terminated_length": 113.4320068359375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.12296538054943085, "epoch": 0.07656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4932946876468185e-06, "loss": 0.0, "num_tokens": 40598832.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1287686564028263, "epoch": 0.07664, "grad_norm": 0.0, "learning_rate": 3.4932541370517703e-06, "loss": 0.0, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 120.8203125, "completions/mean_terminated_length": 120.8203125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.11148775741457939, "epoch": 0.07672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4932134644473444e-06, "loss": 0.0, "num_tokens": 40679833.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1040053516626358, "epoch": 0.0768, "grad_norm": 0.0, "learning_rate": 3.493172669836388e-06, "loss": 0.0, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 118.28125, "completions/mean_terminated_length": 118.28125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.1164545938372612, "epoch": 0.07688, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4931317532217566e-06, "loss": 0.0, "num_tokens": 40760509.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12174780666828156, "epoch": 0.07696, "grad_norm": 0.0, "learning_rate": 3.493090714606313e-06, "loss": 0.0, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 256.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 109.4765625, "completions/mean_terminated_length": 107.15080261230469, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.12320443615317345, "epoch": 0.07704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4930495539929296e-06, "loss": 0.0, "num_tokens": 40840058.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1230475977063179, "epoch": 0.07712, "grad_norm": 0.0, "learning_rate": 3.493008271384488e-06, "loss": 0.0, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 125.46875, "completions/mean_terminated_length": 121.25806427001953, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.11159167438745499, "epoch": 0.0772, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.492966866783877e-06, "loss": 0.0, "num_tokens": 40921654.0, "reward": 0.09413323551416397, "reward_std": 0.0, "rewards/reward_fn/mean": 0.09413323551416397, "rewards/reward_fn/std": 0.2500317096710205, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1202332079410553, "epoch": 0.07728, "grad_norm": 0.0, "learning_rate": 3.492925340193994e-06, "loss": 0.0, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 114.984375, "completions/mean_terminated_length": 114.984375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.11936155706644058, "epoch": 0.07736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4928836916177465e-06, "loss": 0.0, "num_tokens": 41001908.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11852394416928291, "epoch": 0.07744, "grad_norm": 0.0, "learning_rate": 3.492841921058049e-06, "loss": 0.0, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 121.0390625, "completions/mean_terminated_length": 121.0390625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.10435060784220695, "epoch": 0.07752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4928000285178246e-06, "loss": 0.0, "num_tokens": 41082937.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09598434716463089, "epoch": 0.0776, "grad_norm": 0.0, "learning_rate": 3.4927580140000057e-06, "loss": 0.0, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 111.5859375, "completions/mean_terminated_length": 111.5859375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.11710521206259727, "epoch": 0.07768, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4927158775075327e-06, "loss": 0.0, "num_tokens": 41162756.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11356746405363083, "epoch": 0.07776, "grad_norm": 0.0, "learning_rate": 3.492673619043355e-06, "loss": 0.0, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 126.5546875, "completions/mean_terminated_length": 125.53543090820312, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.11078286543488503, "epoch": 0.07784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4926312386104302e-06, "loss": 0.0, "num_tokens": 41244491.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11673123389482498, "epoch": 0.07792, "grad_norm": 0.0, "learning_rate": 3.492588736211724e-06, "loss": 0.0, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 90.109375, "completions/mean_terminated_length": 90.109375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.14052622765302658, "epoch": 0.078, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.492546111850212e-06, "loss": 0.0, "num_tokens": 41321561.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10503193736076355, "epoch": 0.07808, "grad_norm": 0.0, "learning_rate": 3.4925033655288766e-06, "loss": 0.0, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1171875, "completions/max_length": 256.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 124.796875, "completions/mean_terminated_length": 107.38053131103516, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.12043849378824234, "epoch": 0.07816, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.49246049725071e-06, "loss": 0.0, "num_tokens": 41403071.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11291173473000526, "epoch": 0.07824, "grad_norm": 0.0, "learning_rate": 3.4924175070187125e-06, "loss": 0.0, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 117.6015625, "completions/mean_terminated_length": 116.51181030273438, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.11430008709430695, "epoch": 0.07832, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.492374394835893e-06, "loss": 0.0, "num_tokens": 41483660.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10698139667510986, "epoch": 0.0784, "grad_norm": 0.0, "learning_rate": 3.4923311607052684e-06, "loss": 0.0, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 102.7109375, "completions/mean_terminated_length": 102.7109375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 0.12594813108444214, "epoch": 0.07848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.492287804629865e-06, "loss": 0.0, "num_tokens": 41562343.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12769698351621628, "epoch": 0.07856, "grad_norm": 0.0, "learning_rate": 3.4922443266127177e-06, "loss": 0.0, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 109.2734375, "completions/mean_terminated_length": 109.2734375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.10990018025040627, "epoch": 0.07864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.492200726656869e-06, "loss": 0.0, "num_tokens": 41641866.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10762454941868782, "epoch": 0.07872, "grad_norm": 0.0, "learning_rate": 3.492157004765371e-06, "loss": 0.0, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 115.203125, "completions/mean_terminated_length": 115.203125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.11654048040509224, "epoch": 0.0788, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4921131609412826e-06, "loss": 0.0, "num_tokens": 41722148.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11436732858419418, "epoch": 0.07888, "grad_norm": 0.0, "learning_rate": 3.4920691951876734e-06, "loss": 0.0, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 112.6640625, "completions/mean_terminated_length": 112.6640625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.10474212095141411, "epoch": 0.07896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4920251075076204e-06, "loss": 0.0, "num_tokens": 41802105.0, "reward": 0.4471285343170166, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4471285343170166, "rewards/reward_fn/std": 0.9843262434005737, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10226741805672646, "epoch": 0.07904, "grad_norm": 0.0, "learning_rate": 3.491980897904209e-06, "loss": 0.0, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 113.515625, "completions/mean_terminated_length": 113.515625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.11041431128978729, "epoch": 0.07912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4919365663805336e-06, "loss": 0.0, "num_tokens": 41882171.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11354997009038925, "epoch": 0.0792, "grad_norm": 0.0, "learning_rate": 3.491892112939697e-06, "loss": 0.0, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 114.3828125, "completions/mean_terminated_length": 113.26771545410156, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.1138971820473671, "epoch": 0.07928, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4918475375848103e-06, "loss": 0.0, "num_tokens": 41962348.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.116082102060318, "epoch": 0.07936, "grad_norm": 0.0, "learning_rate": 3.4918028403189933e-06, "loss": 0.0, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 122.53125, "completions/mean_terminated_length": 121.48031616210938, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.12211049348115921, "epoch": 0.07944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4917580211453744e-06, "loss": 0.0, "num_tokens": 42043568.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11420934647321701, "epoch": 0.07952, "grad_norm": 0.0, "learning_rate": 3.49171308006709e-06, "loss": 0.0, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 137.7734375, "completions/mean_terminated_length": 137.7734375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.11056862026453018, "epoch": 0.0796, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4916680170872866e-06, "loss": 0.0, "num_tokens": 42126739.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10541099309921265, "epoch": 0.07968, "grad_norm": 0.0, "learning_rate": 3.491622832209117e-06, "loss": 0.0, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 109.28125, "completions/mean_terminated_length": 109.28125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.09699581936001778, "epoch": 0.07976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.491577525435745e-06, "loss": 0.0, "num_tokens": 42206263.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0970776341855526, "epoch": 0.07984, "grad_norm": 0.0, "learning_rate": 3.49153209677034e-06, "loss": 0.0, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 101.265625, "completions/mean_terminated_length": 101.265625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.09855754300951958, "epoch": 0.07992, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4914865462160824e-06, "loss": 0.0, "num_tokens": 42284761.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10992668196558952, "epoch": 0.08, "grad_norm": 0.0, "learning_rate": 3.49144087377616e-06, "loss": 0.0, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 256.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 116.609375, "completions/mean_terminated_length": 108.54544830322266, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.12012515962123871, "epoch": 0.08008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4913950794537697e-06, "loss": 0.0, "num_tokens": 42365223.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11680605262517929, "epoch": 0.08016, "grad_norm": 0.0, "learning_rate": 3.4913491632521165e-06, "loss": 0.0, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 106.8671875, "completions/mean_terminated_length": 106.8671875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.11918551474809647, "epoch": 0.08024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4913031251744144e-06, "loss": 0.0, "num_tokens": 42444438.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11728568002581596, "epoch": 0.08032, "grad_norm": 0.0, "learning_rate": 3.4912569652238844e-06, "loss": 0.0, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 130.4609375, "completions/mean_terminated_length": 129.47244262695312, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.08552344888448715, "epoch": 0.0804, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4912106834037586e-06, "loss": 0.0, "num_tokens": 42526673.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09525338560342789, "epoch": 0.08048, "grad_norm": 0.0, "learning_rate": 3.4911642797172754e-06, "loss": 0.0, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 125.71875, "completions/mean_terminated_length": 125.71875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.11032634600996971, "epoch": 0.08056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4911177541676826e-06, "loss": 0.0, "num_tokens": 42608301.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11419868469238281, "epoch": 0.08064, "grad_norm": 0.0, "learning_rate": 3.4910711067582367e-06, "loss": 0.0, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 117.3046875, "completions/mean_terminated_length": 116.21260070800781, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.11854861676692963, "epoch": 0.08072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4910243374922026e-06, "loss": 0.0, "num_tokens": 42688852.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10978639498353004, "epoch": 0.0808, "grad_norm": 0.0, "learning_rate": 3.4909774463728536e-06, "loss": 0.0, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 106.1796875, "completions/mean_terminated_length": 106.1796875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.11698530986905098, "epoch": 0.08088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.490930433403472e-06, "loss": 0.0, "num_tokens": 42767979.0, "reward": 0.08830241858959198, "reward_std": 0.0, "rewards/reward_fn/mean": 0.08830241858959198, "rewards/reward_fn/std": 0.23454421758651733, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10787990689277649, "epoch": 0.08096, "grad_norm": 0.0, "learning_rate": 3.490883298587347e-06, "loss": 0.0, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 131.859375, "completions/mean_terminated_length": 131.859375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.11230366304516792, "epoch": 0.08104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4908360419277788e-06, "loss": 0.0, "num_tokens": 42850393.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11421741172671318, "epoch": 0.08112, "grad_norm": 0.0, "learning_rate": 3.490788663428074e-06, "loss": 0.0, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 103.2265625, "completions/mean_terminated_length": 103.2265625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.12278367951512337, "epoch": 0.0812, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4907411630915492e-06, "loss": 0.0, "num_tokens": 42929142.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12030551955103874, "epoch": 0.08128, "grad_norm": 0.0, "learning_rate": 3.4906935409215284e-06, "loss": 0.0, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 102.09375, "completions/mean_terminated_length": 102.09375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.11385096982121468, "epoch": 0.08136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4906457969213455e-06, "loss": 0.0, "num_tokens": 43007746.0, "reward": 0.37749966979026794, "reward_std": 0.0, "rewards/reward_fn/mean": 0.37749966979026794, "rewards/reward_fn/std": 0.9951284527778625, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.13025544956326485, "epoch": 0.08144, "grad_norm": 0.0, "learning_rate": 3.490597931094341e-06, "loss": 0.0, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 122.9296875, "completions/mean_terminated_length": 122.9296875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.10184834524989128, "epoch": 0.08152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.490549943443866e-06, "loss": 0.0, "num_tokens": 43089017.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10103318467736244, "epoch": 0.0816, "grad_norm": 0.0, "learning_rate": 3.490501833973278e-06, "loss": 0.0, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 106.921875, "completions/mean_terminated_length": 106.921875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.10422886162996292, "epoch": 0.08168, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4904536026859454e-06, "loss": 0.0, "num_tokens": 43168239.0, "reward": 0.7698310613632202, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7698310613632202, "rewards/reward_fn/std": 1.293669581413269, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10234562680125237, "epoch": 0.08176, "grad_norm": 0.0, "learning_rate": 3.490405249585243e-06, "loss": 0.0, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 116.5390625, "completions/mean_terminated_length": 116.5390625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.10615739971399307, "epoch": 0.08184, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.490356774674555e-06, "loss": 0.0, "num_tokens": 43248692.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0979994386434555, "epoch": 0.08192, "grad_norm": 0.0, "learning_rate": 3.490308177957275e-06, "loss": 0.0, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 126.4765625, "completions/mean_terminated_length": 126.4765625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.10266315191984177, "epoch": 0.082, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4902594594368037e-06, "loss": 0.0, "num_tokens": 43330417.0, "reward": 0.07711366564035416, "reward_std": 0.0, "rewards/reward_fn/mean": 0.07711366564035416, "rewards/reward_fn/std": 0.2048252522945404, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1001880019903183, "epoch": 0.08208, "grad_norm": 0.0, "learning_rate": 3.49021061911655e-06, "loss": 0.0, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 256.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 110.2578125, "completions/mean_terminated_length": 107.94445037841797, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.10865392908453941, "epoch": 0.08216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4901616569999337e-06, "loss": 0.0, "num_tokens": 43410066.0, "reward": 0.02943696826696396, "reward_std": 0.0, "rewards/reward_fn/mean": 0.02943696826696396, "rewards/reward_fn/std": 0.07818891853094101, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11438634246587753, "epoch": 0.08224, "grad_norm": 0.0, "learning_rate": 3.4901125730903806e-06, "loss": 0.0, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 118.5625, "completions/mean_terminated_length": 118.5625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.10082719847559929, "epoch": 0.08232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.490063367391327e-06, "loss": 0.0, "num_tokens": 43490778.0, "reward": 0.38249102234840393, "reward_std": 0.0, "rewards/reward_fn/mean": 0.38249102234840393, "rewards/reward_fn/std": 0.9934079051017761, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10184156522154808, "epoch": 0.0824, "grad_norm": 0.0, "learning_rate": 3.4900140399062162e-06, "loss": 0.0, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 109.2578125, "completions/mean_terminated_length": 108.10236358642578, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 0.1393626183271408, "epoch": 0.08248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4899645906385e-06, "loss": 0.0, "num_tokens": 43570299.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1415623500943184, "epoch": 0.08256, "grad_norm": 0.0, "learning_rate": 3.4899150195916404e-06, "loss": 0.0, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 119.0, "completions/mean_terminated_length": 119.0, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.1017913892865181, "epoch": 0.08264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.489865326769107e-06, "loss": 0.0, "num_tokens": 43651067.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09752736613154411, "epoch": 0.08272, "grad_norm": 0.0, "learning_rate": 3.4898155121743757e-06, "loss": 0.0, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 116.3359375, "completions/mean_terminated_length": 116.3359375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.09935544058680534, "epoch": 0.0828, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4897655758109356e-06, "loss": 0.0, "num_tokens": 43731494.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10217775404453278, "epoch": 0.08288, "grad_norm": 0.0, "learning_rate": 3.4897155176822805e-06, "loss": 0.0, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1640625, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 138.6484375, "completions/mean_terminated_length": 115.6168212890625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.11988688260316849, "epoch": 0.08296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4896653377919137e-06, "loss": 0.0, "num_tokens": 43814777.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10940118879079819, "epoch": 0.08304, "grad_norm": 0.0, "learning_rate": 3.4896150361433475e-06, "loss": 0.0, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 120.5859375, "completions/mean_terminated_length": 119.51968383789062, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.09444631636142731, "epoch": 0.08312, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.489564612740103e-06, "loss": 0.0, "num_tokens": 43895748.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10089948773384094, "epoch": 0.0832, "grad_norm": 0.0, "learning_rate": 3.4895140675857086e-06, "loss": 0.0, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 123.09375, "completions/mean_terminated_length": 123.09375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.11235075816512108, "epoch": 0.08328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4894634006837026e-06, "loss": 0.0, "num_tokens": 43977040.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11117249727249146, "epoch": 0.08336, "grad_norm": 0.0, "learning_rate": 3.48941261203763e-06, "loss": 0.0, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 121.1015625, "completions/mean_terminated_length": 120.03936767578125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.11212780699133873, "epoch": 0.08344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4893617016510463e-06, "loss": 0.0, "num_tokens": 44058077.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11315042525529861, "epoch": 0.08352, "grad_norm": 0.0, "learning_rate": 3.4893106695275154e-06, "loss": 0.0, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 113.8515625, "completions/mean_terminated_length": 113.8515625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.10165378823876381, "epoch": 0.0836, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4892595156706083e-06, "loss": 0.0, "num_tokens": 44138186.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10402564331889153, "epoch": 0.08368, "grad_norm": 0.0, "learning_rate": 3.489208240083904e-06, "loss": 0.0, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 256.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 122.0, "completions/mean_terminated_length": 118.78400421142578, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.11564111337065697, "epoch": 0.08376, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.489156842770993e-06, "loss": 0.0, "num_tokens": 44219338.0, "reward": 0.12482782453298569, "reward_std": 0.0, "rewards/reward_fn/mean": 0.12482782453298569, "rewards/reward_fn/std": 0.3315610885620117, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11469100043177605, "epoch": 0.08384, "grad_norm": 0.0, "learning_rate": 3.489105323735472e-06, "loss": 0.0, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 256.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 124.1484375, "completions/mean_terminated_length": 120.9840087890625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.11334914714097977, "epoch": 0.08392, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.489053682980947e-06, "loss": 0.0, "num_tokens": 44300765.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.118477463722229, "epoch": 0.084, "grad_norm": 0.0, "learning_rate": 3.489001920511032e-06, "loss": 0.0, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 109.46875, "completions/mean_terminated_length": 109.46875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.11652344465255737, "epoch": 0.08408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4889500363293497e-06, "loss": 0.0, "num_tokens": 44380313.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11338522285223007, "epoch": 0.08416, "grad_norm": 0.0, "learning_rate": 3.4888980304395318e-06, "loss": 0.0, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 108.8203125, "completions/mean_terminated_length": 108.8203125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.12072482705116272, "epoch": 0.08424, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.488845902845218e-06, "loss": 0.0, "num_tokens": 44459778.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12914977595210075, "epoch": 0.08432, "grad_norm": 0.0, "learning_rate": 3.4887936535500565e-06, "loss": 0.0, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 120.0546875, "completions/mean_terminated_length": 118.9842529296875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.11102195829153061, "epoch": 0.0844, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.488741282557704e-06, "loss": 0.0, "num_tokens": 44540681.0, "reward": 0.02943696826696396, "reward_std": 0.0, "rewards/reward_fn/mean": 0.02943696826696396, "rewards/reward_fn/std": 0.07818891853094101, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11464093625545502, "epoch": 0.08448, "grad_norm": 0.0, "learning_rate": 3.488688789871827e-06, "loss": 0.0, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 101.0546875, "completions/mean_terminated_length": 101.0546875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.13356781005859375, "epoch": 0.08456, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4886361754960987e-06, "loss": 0.0, "num_tokens": 44619152.0, "reward": 0.4926808476448059, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4926808476448059, "rewards/reward_fn/std": 0.9958887100219727, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1328003816306591, "epoch": 0.08464, "grad_norm": 0.0, "learning_rate": 3.4885834394342015e-06, "loss": 0.0, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 132.6953125, "completions/mean_terminated_length": 128.71774291992188, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.11726906150579453, "epoch": 0.08472, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.488530581689826e-06, "loss": 0.0, "num_tokens": 44701673.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11254876479506493, "epoch": 0.0848, "grad_norm": 0.0, "learning_rate": 3.488477602266673e-06, "loss": 0.0, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 108.65625, "completions/mean_terminated_length": 108.65625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.11307531967759132, "epoch": 0.08488, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4884245011684488e-06, "loss": 0.0, "num_tokens": 44781117.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11858313530683517, "epoch": 0.08496, "grad_norm": 0.0, "learning_rate": 3.488371278398871e-06, "loss": 0.0, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 256.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 128.3359375, "completions/mean_terminated_length": 126.30953216552734, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.11847035214304924, "epoch": 0.08504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4883179339616643e-06, "loss": 0.0, "num_tokens": 44863080.0, "reward": 0.0074910130351781845, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0074910130351781845, "rewards/reward_fn/std": 0.019897233694791794, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11374210566282272, "epoch": 0.08512, "grad_norm": 0.0, "learning_rate": 3.4882644678605627e-06, "loss": 0.0, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 110.6328125, "completions/mean_terminated_length": 110.6328125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.1124238483607769, "epoch": 0.0852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.488210880099308e-06, "loss": 0.0, "num_tokens": 44942777.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10681337490677834, "epoch": 0.08528, "grad_norm": 0.0, "learning_rate": 3.48815717068165e-06, "loss": 0.0, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 132.4375, "completions/mean_terminated_length": 132.4375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.1164996288716793, "epoch": 0.08536, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4881033396113486e-06, "loss": 0.0, "num_tokens": 45025265.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11347607150673866, "epoch": 0.08544, "grad_norm": 0.0, "learning_rate": 3.4880493868921714e-06, "loss": 0.0, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 117.1953125, "completions/mean_terminated_length": 117.1953125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.1031079962849617, "epoch": 0.08552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4879953125278945e-06, "loss": 0.0, "num_tokens": 45105802.0, "reward": 0.01492841262370348, "reward_std": 0.0, "rewards/reward_fn/mean": 0.01492841262370348, "rewards/reward_fn/std": 0.039652060717344284, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10567205399274826, "epoch": 0.0856, "grad_norm": 0.0, "learning_rate": 3.4879411165223027e-06, "loss": 0.0, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 117.5703125, "completions/mean_terminated_length": 117.5703125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.11332858726382256, "epoch": 0.08568, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4878867988791887e-06, "loss": 0.0, "num_tokens": 45186387.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11422845721244812, "epoch": 0.08576, "grad_norm": 0.0, "learning_rate": 3.487832359602354e-06, "loss": 0.0, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 114.1796875, "completions/mean_terminated_length": 114.1796875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.10254541411995888, "epoch": 0.08584, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4877777986956093e-06, "loss": 0.0, "num_tokens": 45266538.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1051960326731205, "epoch": 0.08592, "grad_norm": 0.0, "learning_rate": 3.487723116162773e-06, "loss": 0.0, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 117.6171875, "completions/mean_terminated_length": 117.6171875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.1072876863181591, "epoch": 0.086, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.487668312007672e-06, "loss": 0.0, "num_tokens": 45347129.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10502145811915398, "epoch": 0.08608, "grad_norm": 0.0, "learning_rate": 3.487613386234143e-06, "loss": 0.0, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 111.8515625, "completions/mean_terminated_length": 111.8515625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.1314760409295559, "epoch": 0.08616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.48755833884603e-06, "loss": 0.0, "num_tokens": 45426982.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12134729325771332, "epoch": 0.08624, "grad_norm": 0.0, "learning_rate": 3.4875031698471846e-06, "loss": 0.0, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 114.7109375, "completions/mean_terminated_length": 114.7109375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.10539299249649048, "epoch": 0.08632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.487447879241469e-06, "loss": 0.0, "num_tokens": 45507201.0, "reward": 0.02943696826696396, "reward_std": 0.0, "rewards/reward_fn/mean": 0.02943696826696396, "rewards/reward_fn/std": 0.07818891853094101, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1022258847951889, "epoch": 0.0864, "grad_norm": 0.0, "learning_rate": 3.487392467032753e-06, "loss": 0.0, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 128.984375, "completions/mean_terminated_length": 126.96826171875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.11038390174508095, "epoch": 0.08648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.487336933224914e-06, "loss": 0.0, "num_tokens": 45589247.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11847275495529175, "epoch": 0.08656, "grad_norm": 0.0, "learning_rate": 3.4872812778218407e-06, "loss": 0.0, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 256.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 125.3125, "completions/mean_terminated_length": 110.53912353515625, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 0.1488831415772438, "epoch": 0.08664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.487225500827426e-06, "loss": 0.0, "num_tokens": 45670823.0, "reward": 0.8383024334907532, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8383024334907532, "rewards/reward_fn/std": 1.2736961841583252, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.14584719389677048, "epoch": 0.08672, "grad_norm": 0.0, "learning_rate": 3.4871696022455755e-06, "loss": 0.0, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 110.265625, "completions/mean_terminated_length": 110.265625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.14044740796089172, "epoch": 0.0868, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4871135820802004e-06, "loss": 0.0, "num_tokens": 45750473.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1285300999879837, "epoch": 0.08688, "grad_norm": 0.0, "learning_rate": 3.4870574403352216e-06, "loss": 0.0, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 110.7734375, "completions/mean_terminated_length": 110.7734375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.10478761047124863, "epoch": 0.08696, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4870011770145696e-06, "loss": 0.0, "num_tokens": 45830188.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09817630052566528, "epoch": 0.08704, "grad_norm": 0.0, "learning_rate": 3.4869447921221806e-06, "loss": 0.0, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 99.65625, "completions/mean_terminated_length": 98.4251937866211, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.11260134354233742, "epoch": 0.08712, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.486888285662002e-06, "loss": 0.0, "num_tokens": 45908480.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11332425847649574, "epoch": 0.0872, "grad_norm": 0.0, "learning_rate": 3.4868316576379888e-06, "loss": 0.0, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 256.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 111.625, "completions/mean_terminated_length": 99.38983154296875, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.12787066027522087, "epoch": 0.08728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4867749080541032e-06, "loss": 0.0, "num_tokens": 45988304.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12752417474985123, "epoch": 0.08736, "grad_norm": 0.0, "learning_rate": 3.486718036914319e-06, "loss": 0.0, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 111.3359375, "completions/mean_terminated_length": 110.19685363769531, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.10347190871834755, "epoch": 0.08744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4866610442226144e-06, "loss": 0.0, "num_tokens": 46068091.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10675002634525299, "epoch": 0.08752, "grad_norm": 0.0, "learning_rate": 3.48660392998298e-06, "loss": 0.0, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 96.234375, "completions/mean_terminated_length": 96.234375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.11562159657478333, "epoch": 0.0876, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.486546694199412e-06, "loss": 0.0, "num_tokens": 46145945.0, "reward": 0.8445001840591431, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8445001840591431, "rewards/reward_fn/std": 1.2646361589431763, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12344890460371971, "epoch": 0.08768, "grad_norm": 0.0, "learning_rate": 3.486489336875917e-06, "loss": 0.0, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 107.1875, "completions/mean_terminated_length": 107.1875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 0.11971041187644005, "epoch": 0.08776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.486431858016509e-06, "loss": 0.0, "num_tokens": 46225201.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11745235696434975, "epoch": 0.08784, "grad_norm": 0.0, "learning_rate": 3.486374257625212e-06, "loss": 0.0, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 106.125, "completions/mean_terminated_length": 101.29032135009766, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.12557422369718552, "epoch": 0.08792, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4863165357060557e-06, "loss": 0.0, "num_tokens": 46304321.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11064497381448746, "epoch": 0.088, "grad_norm": 0.0, "learning_rate": 3.4862586922630813e-06, "loss": 0.0, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 114.1015625, "completions/mean_terminated_length": 114.1015625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.10798955336213112, "epoch": 0.08808, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4862007273003362e-06, "loss": 0.0, "num_tokens": 46384462.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10714859887957573, "epoch": 0.08816, "grad_norm": 0.0, "learning_rate": 3.4861426408218782e-06, "loss": 0.0, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 98.3515625, "completions/mean_terminated_length": 98.3515625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 0.11187005788087845, "epoch": 0.08824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.486084432831773e-06, "loss": 0.0, "num_tokens": 46462587.0, "reward": 0.17610588669776917, "reward_std": 0.0, "rewards/reward_fn/mean": 0.17610588669776917, "rewards/reward_fn/std": 0.29492589831352234, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11490857973694801, "epoch": 0.08832, "grad_norm": 0.0, "learning_rate": 3.486026103334093e-06, "loss": 0.0, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 106.7421875, "completions/mean_terminated_length": 106.7421875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.10729356855154037, "epoch": 0.0884, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4859676523329226e-06, "loss": 0.0, "num_tokens": 46541786.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10766102746129036, "epoch": 0.08848, "grad_norm": 0.0, "learning_rate": 3.4859090798323517e-06, "loss": 0.0, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 127.2890625, "completions/mean_terminated_length": 127.2890625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.1049608513712883, "epoch": 0.08856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.485850385836479e-06, "loss": 0.0, "num_tokens": 46623615.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10440359264612198, "epoch": 0.08864, "grad_norm": 0.0, "learning_rate": 3.4857915703494145e-06, "loss": 0.0, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 120.6875, "completions/mean_terminated_length": 119.6220474243164, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.11451771855354309, "epoch": 0.08872, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4857326333752725e-06, "loss": 0.0, "num_tokens": 46704599.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11739762127399445, "epoch": 0.0888, "grad_norm": 0.0, "learning_rate": 3.4856735749181795e-06, "loss": 0.0, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 88.4921875, "completions/mean_terminated_length": 88.4921875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.14355996251106262, "epoch": 0.08888, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4856143949822686e-06, "loss": 0.0, "num_tokens": 46781462.0, "reward": 1.5124585628509521, "reward_std": 0.0, "rewards/reward_fn/mean": 1.5124585628509521, "rewards/reward_fn/std": 1.493700623512268, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.14112048596143723, "epoch": 0.08896, "grad_norm": 0.0, "learning_rate": 3.4855550935716814e-06, "loss": 0.0, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 116.6328125, "completions/mean_terminated_length": 116.6328125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.11109301447868347, "epoch": 0.08904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4854956706905687e-06, "loss": 0.0, "num_tokens": 46861927.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12606117874383926, "epoch": 0.08912, "grad_norm": 0.0, "learning_rate": 3.485436126343089e-06, "loss": 0.0, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 256.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 122.5625, "completions/mean_terminated_length": 120.44445037841797, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 0.11635610461235046, "epoch": 0.0892, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.485376460533411e-06, "loss": 0.0, "num_tokens": 46943151.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1098250262439251, "epoch": 0.08928, "grad_norm": 0.0, "learning_rate": 3.4853166732657084e-06, "loss": 0.0, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 105.8984375, "completions/mean_terminated_length": 104.71653747558594, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.11674690246582031, "epoch": 0.08936, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.485256764544168e-06, "loss": 0.0, "num_tokens": 47022242.0, "reward": 0.8748958110809326, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8748958110809326, "rewards/reward_fn/std": 1.248794436454773, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12692268565297127, "epoch": 0.08944, "grad_norm": 0.0, "learning_rate": 3.4851967343729815e-06, "loss": 0.0, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 110.5625, "completions/mean_terminated_length": 109.41732025146484, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 0.11504000797867775, "epoch": 0.08952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.485136582756351e-06, "loss": 0.0, "num_tokens": 47101930.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12440225481987, "epoch": 0.0896, "grad_norm": 0.0, "learning_rate": 3.485076309698486e-06, "loss": 0.0, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 120.15625, "completions/mean_terminated_length": 119.08661651611328, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.11199469491839409, "epoch": 0.08968, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.485015915203606e-06, "loss": 0.0, "num_tokens": 47182846.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11690477654337883, "epoch": 0.08976, "grad_norm": 0.0, "learning_rate": 3.484955399275936e-06, "loss": 0.0, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 109.7109375, "completions/mean_terminated_length": 109.7109375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.0950896255671978, "epoch": 0.08984, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4848947619197133e-06, "loss": 0.0, "num_tokens": 47262425.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09127096459269524, "epoch": 0.08992, "grad_norm": 0.0, "learning_rate": 3.484834003139181e-06, "loss": 0.0, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 117.84375, "completions/mean_terminated_length": 117.84375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.10797977074980736, "epoch": 0.09, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.484773122938592e-06, "loss": 0.0, "num_tokens": 47343045.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09952344000339508, "epoch": 0.09008, "grad_norm": 0.0, "learning_rate": 3.4847121213222063e-06, "loss": 0.0, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 112.3125, "completions/mean_terminated_length": 112.3125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.09821468964219093, "epoch": 0.09016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.484650998294295e-06, "loss": 0.0, "num_tokens": 47422957.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09645706415176392, "epoch": 0.09024, "grad_norm": 0.0, "learning_rate": 3.4845897538591346e-06, "loss": 0.0, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 112.0703125, "completions/mean_terminated_length": 110.93700408935547, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.11594261229038239, "epoch": 0.09032, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4845283880210127e-06, "loss": 0.0, "num_tokens": 47502838.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12312203645706177, "epoch": 0.0904, "grad_norm": 0.0, "learning_rate": 3.484466900784224e-06, "loss": 0.0, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 126.7734375, "completions/mean_terminated_length": 126.7734375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.09998257458209991, "epoch": 0.09048, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.484405292153071e-06, "loss": 0.0, "num_tokens": 47584601.0, "reward": 0.012458499521017075, "reward_std": 0.0, "rewards/reward_fn/mean": 0.012458499521017075, "rewards/reward_fn/std": 0.03309160843491554, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09835709258913994, "epoch": 0.09056, "grad_norm": 0.0, "learning_rate": 3.4843435621318666e-06, "loss": 0.0, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 91.7421875, "completions/mean_terminated_length": 91.7421875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.10344229638576508, "epoch": 0.09064, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.484281710724931e-06, "loss": 0.0, "num_tokens": 47661880.0, "reward": 0.7794369459152222, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7794369459152222, "rewards/reward_fn/std": 1.2893400192260742, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1004820205271244, "epoch": 0.09072, "grad_norm": 0.0, "learning_rate": 3.4842197379365927e-06, "loss": 0.0, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 92.8125, "completions/mean_terminated_length": 92.8125, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 0.1622350960969925, "epoch": 0.0908, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.48415764377119e-06, "loss": 0.0, "num_tokens": 47739296.0, "reward": 0.4020647704601288, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4020647704601288, "rewards/reward_fn/std": 0.9883498549461365, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.13794133812189102, "epoch": 0.09088, "grad_norm": 0.0, "learning_rate": 3.484095428233068e-06, "loss": 0.0, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 102.765625, "completions/mean_terminated_length": 102.765625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.10559768602252007, "epoch": 0.09096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.484033091326582e-06, "loss": 0.0, "num_tokens": 47817986.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10007914528250694, "epoch": 0.09104, "grad_norm": 0.0, "learning_rate": 3.483970633056094e-06, "loss": 0.0, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 112.5703125, "completions/mean_terminated_length": 112.5703125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.10734876990318298, "epoch": 0.09112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4839080534259764e-06, "loss": 0.0, "num_tokens": 47897931.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10985511541366577, "epoch": 0.0912, "grad_norm": 0.0, "learning_rate": 3.483845352440608e-06, "loss": 0.0, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 128.6640625, "completions/mean_terminated_length": 127.66141510009766, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.10136925801634789, "epoch": 0.09128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4837825301043785e-06, "loss": 0.0, "num_tokens": 47979936.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10397794470191002, "epoch": 0.09136, "grad_norm": 0.0, "learning_rate": 3.483719586421684e-06, "loss": 0.0, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 99.59375, "completions/mean_terminated_length": 99.59375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.11731519550085068, "epoch": 0.09144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4836565213969297e-06, "loss": 0.0, "num_tokens": 48058220.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11506820097565651, "epoch": 0.09152, "grad_norm": 0.0, "learning_rate": 3.4835933350345294e-06, "loss": 0.0, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 127.609375, "completions/mean_terminated_length": 127.609375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.09890441969037056, "epoch": 0.0916, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4835300273389067e-06, "loss": 0.0, "num_tokens": 48140090.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10452334955334663, "epoch": 0.09168, "grad_norm": 0.0, "learning_rate": 3.483466598314491e-06, "loss": 0.0, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 113.078125, "completions/mean_terminated_length": 113.078125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.10525070503354073, "epoch": 0.09176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4834030479657223e-06, "loss": 0.0, "num_tokens": 48220100.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10677464306354523, "epoch": 0.09184, "grad_norm": 0.0, "learning_rate": 3.4833393762970484e-06, "loss": 0.0, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 99.109375, "completions/mean_terminated_length": 99.109375, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 0.14368005841970444, "epoch": 0.09192, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.483275583312926e-06, "loss": 0.0, "num_tokens": 48298322.0, "reward": 0.37749966979026794, "reward_std": 0.0, "rewards/reward_fn/mean": 0.37749966979026794, "rewards/reward_fn/std": 0.9951284527778625, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11617060378193855, "epoch": 0.092, "grad_norm": 0.0, "learning_rate": 3.4832116690178197e-06, "loss": 0.0, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 118.8515625, "completions/mean_terminated_length": 118.8515625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.08775530382990837, "epoch": 0.09208, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4831476334162024e-06, "loss": 0.0, "num_tokens": 48379071.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08808385953307152, "epoch": 0.09216, "grad_norm": 0.0, "learning_rate": 3.4830834765125565e-06, "loss": 0.0, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 102.875, "completions/mean_terminated_length": 102.875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.12761717289686203, "epoch": 0.09224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4830191983113718e-06, "loss": 0.0, "num_tokens": 48457775.0, "reward": 0.8077646493911743, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8077646493911743, "rewards/reward_fn/std": 1.2794528007507324, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12342264503240585, "epoch": 0.09232, "grad_norm": 0.0, "learning_rate": 3.4829547988171476e-06, "loss": 0.0, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 117.359375, "completions/mean_terminated_length": 117.359375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.09679945930838585, "epoch": 0.0924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.482890278034391e-06, "loss": 0.0, "num_tokens": 48538333.0, "reward": 0.07393992692232132, "reward_std": 0.0, "rewards/reward_fn/mean": 0.07393992692232132, "rewards/reward_fn/std": 0.19639533758163452, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10541965812444687, "epoch": 0.09248, "grad_norm": 0.0, "learning_rate": 3.4828256359676173e-06, "loss": 0.0, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 102.359375, "completions/mean_terminated_length": 102.359375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.1291700303554535, "epoch": 0.09256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4827608726213515e-06, "loss": 0.0, "num_tokens": 48616971.0, "reward": 0.019831063225865364, "reward_std": 0.0, "rewards/reward_fn/mean": 0.019831063225865364, "rewards/reward_fn/std": 0.052674222737550735, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.13157011568546295, "epoch": 0.09264, "grad_norm": 0.0, "learning_rate": 3.4826959880001266e-06, "loss": 0.0, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 256.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 115.515625, "completions/mean_terminated_length": 109.80487060546875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.10463280603289604, "epoch": 0.09272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4826309821084823e-06, "loss": 0.0, "num_tokens": 48697293.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10278439149260521, "epoch": 0.0928, "grad_norm": 0.0, "learning_rate": 3.4825658549509697e-06, "loss": 0.0, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 118.0625, "completions/mean_terminated_length": 116.97637939453125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.11327430978417397, "epoch": 0.09288, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4825006065321463e-06, "loss": 0.0, "num_tokens": 48777941.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11635816842317581, "epoch": 0.09296, "grad_norm": 0.0, "learning_rate": 3.4824352368565792e-06, "loss": 0.0, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 113.34375, "completions/mean_terminated_length": 113.34375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.11096836254000664, "epoch": 0.09304, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.482369745928844e-06, "loss": 0.0, "num_tokens": 48857985.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10704779252409935, "epoch": 0.09312, "grad_norm": 0.0, "learning_rate": 3.4823041337535236e-06, "loss": 0.0, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 117.9375, "completions/mean_terminated_length": 117.9375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.09473787620663643, "epoch": 0.0932, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4822384003352106e-06, "loss": 0.0, "num_tokens": 48938617.0, "reward": 0.09303461015224457, "reward_std": 0.0, "rewards/reward_fn/mean": 0.09303461015224457, "rewards/reward_fn/std": 0.24711361527442932, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10062115639448166, "epoch": 0.09328, "grad_norm": 0.0, "learning_rate": 3.482172545678505e-06, "loss": 0.0, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 117.3125, "completions/mean_terminated_length": 116.22047424316406, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.11938484013080597, "epoch": 0.09336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4821065697880164e-06, "loss": 0.0, "num_tokens": 49019169.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1259092465043068, "epoch": 0.09344, "grad_norm": 0.0, "learning_rate": 3.482040472668363e-06, "loss": 0.0, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 256.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 122.8203125, "completions/mean_terminated_length": 120.70635986328125, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 0.1328589841723442, "epoch": 0.09352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4819742543241695e-06, "loss": 0.0, "num_tokens": 49100426.0, "reward": 0.3948310613632202, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3948310613632202, "rewards/reward_fn/std": 0.9899041056632996, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.13508863002061844, "epoch": 0.0936, "grad_norm": 0.0, "learning_rate": 3.481907914760072e-06, "loss": 0.0, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 114.7109375, "completions/mean_terminated_length": 114.7109375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.10530534759163857, "epoch": 0.09368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4818414539807127e-06, "loss": 0.0, "num_tokens": 49180645.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1012938879430294, "epoch": 0.09376, "grad_norm": 0.0, "learning_rate": 3.481774871990744e-06, "loss": 0.0, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 102.9765625, "completions/mean_terminated_length": 102.9765625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.1123138964176178, "epoch": 0.09384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4817081687948236e-06, "loss": 0.0, "num_tokens": 49259362.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11003769189119339, "epoch": 0.09392, "grad_norm": 0.0, "learning_rate": 3.481641344397623e-06, "loss": 0.0, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 113.859375, "completions/mean_terminated_length": 112.74015808105469, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.11303271353244781, "epoch": 0.094, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4815743988038177e-06, "loss": 0.0, "num_tokens": 49339472.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1153682991862297, "epoch": 0.09408, "grad_norm": 0.0, "learning_rate": 3.4815073320180926e-06, "loss": 0.0, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 115.546875, "completions/mean_terminated_length": 115.546875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.10895204171538353, "epoch": 0.09416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4814401440451432e-06, "loss": 0.0, "num_tokens": 49419798.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10981019213795662, "epoch": 0.09424, "grad_norm": 0.0, "learning_rate": 3.481372834889671e-06, "loss": 0.0, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 256.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 113.5390625, "completions/mean_terminated_length": 106.53278350830078, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.14557038247585297, "epoch": 0.09432, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4813054045563873e-06, "loss": 0.0, "num_tokens": 49499867.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.14545613527297974, "epoch": 0.0944, "grad_norm": 0.0, "learning_rate": 3.481237853050011e-06, "loss": 0.0, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 120.421875, "completions/mean_terminated_length": 120.421875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.11519703641533852, "epoch": 0.09448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4811701803752704e-06, "loss": 0.0, "num_tokens": 49580817.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10953214764595032, "epoch": 0.09456, "grad_norm": 0.0, "learning_rate": 3.481102386536902e-06, "loss": 0.0, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 118.953125, "completions/mean_terminated_length": 118.953125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.11021138727664948, "epoch": 0.09464, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.48103447153965e-06, "loss": 0.0, "num_tokens": 49661579.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10955458134412766, "epoch": 0.09472, "grad_norm": 0.0, "learning_rate": 3.4809664353882686e-06, "loss": 0.0, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 102.1640625, "completions/mean_terminated_length": 100.9527587890625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.11220676451921463, "epoch": 0.0948, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4808982780875187e-06, "loss": 0.0, "num_tokens": 49740192.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11021895706653595, "epoch": 0.09488, "grad_norm": 0.0, "learning_rate": 3.4808299996421715e-06, "loss": 0.0, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 125.7421875, "completions/mean_terminated_length": 125.7421875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.10591598972678185, "epoch": 0.09496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.480761600057005e-06, "loss": 0.0, "num_tokens": 49821823.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10445421189069748, "epoch": 0.09504, "grad_norm": 0.0, "learning_rate": 3.4806930793368074e-06, "loss": 0.0, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 106.9140625, "completions/mean_terminated_length": 106.9140625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.1112387664616108, "epoch": 0.09512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4806244374863733e-06, "loss": 0.0, "num_tokens": 49901044.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1093798279762268, "epoch": 0.0952, "grad_norm": 0.0, "learning_rate": 3.4805556745105073e-06, "loss": 0.0, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 122.796875, "completions/mean_terminated_length": 122.796875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.11774209141731262, "epoch": 0.09528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.480486790414023e-06, "loss": 0.0, "num_tokens": 49982298.0, "reward": 0.03178694099187851, "reward_std": 0.0, "rewards/reward_fn/mean": 0.03178694099187851, "rewards/reward_fn/std": 0.08443079143762589, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12071120366454124, "epoch": 0.09536, "grad_norm": 0.0, "learning_rate": 3.48041778520174e-06, "loss": 0.0, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 113.4453125, "completions/mean_terminated_length": 108.84677124023438, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.10172612220048904, "epoch": 0.09544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.480348658878489e-06, "loss": 0.0, "num_tokens": 50062355.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10787924006581306, "epoch": 0.09552, "grad_norm": 0.0, "learning_rate": 3.4802794114491078e-06, "loss": 0.0, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 119.7421875, "completions/mean_terminated_length": 118.6692886352539, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.11306428536772728, "epoch": 0.0956, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.480210042918443e-06, "loss": 0.0, "num_tokens": 50143218.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11236975714564323, "epoch": 0.09568, "grad_norm": 0.0, "learning_rate": 3.4801405532913492e-06, "loss": 0.0, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 114.2890625, "completions/mean_terminated_length": 114.2890625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.11935004219412804, "epoch": 0.09576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.480070942572691e-06, "loss": 0.0, "num_tokens": 50223383.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10992903634905815, "epoch": 0.09584, "grad_norm": 0.0, "learning_rate": 3.4800012107673395e-06, "loss": 0.0, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 120.4296875, "completions/mean_terminated_length": 120.4296875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.11681755632162094, "epoch": 0.09592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4799313578801757e-06, "loss": 0.0, "num_tokens": 50304334.0, "reward": 0.4067869484424591, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4067869484424591, "rewards/reward_fn/std": 0.9875356554985046, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11982588097453117, "epoch": 0.096, "grad_norm": 0.0, "learning_rate": 3.4798613839160887e-06, "loss": 0.0, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 115.1171875, "completions/mean_terminated_length": 115.1171875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.11499980837106705, "epoch": 0.09608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.479791288879975e-06, "loss": 0.0, "num_tokens": 50384605.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12730321288108826, "epoch": 0.09616, "grad_norm": 0.0, "learning_rate": 3.4797210727767416e-06, "loss": 0.0, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 108.9609375, "completions/mean_terminated_length": 108.9609375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.10546886920928955, "epoch": 0.09624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.479650735611302e-06, "loss": 0.0, "num_tokens": 50464088.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11147931218147278, "epoch": 0.09632, "grad_norm": 0.0, "learning_rate": 3.4795802773885798e-06, "loss": 0.0, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 114.234375, "completions/mean_terminated_length": 114.234375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.09920010343194008, "epoch": 0.0964, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.479509698113506e-06, "loss": 0.0, "num_tokens": 50544246.0, "reward": 0.05776464566588402, "reward_std": 0.0, "rewards/reward_fn/mean": 0.05776464566588402, "rewards/reward_fn/std": 0.15343140065670013, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10052594915032387, "epoch": 0.09648, "grad_norm": 0.0, "learning_rate": 3.47943899779102e-06, "loss": 0.0, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 134.90625, "completions/mean_terminated_length": 132.984130859375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.1209162287414074, "epoch": 0.09656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.479368176426071e-06, "loss": 0.0, "num_tokens": 50627050.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1164884865283966, "epoch": 0.09664, "grad_norm": 0.0, "learning_rate": 3.4792972340236146e-06, "loss": 0.0, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 131.3125, "completions/mean_terminated_length": 131.3125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.10100126639008522, "epoch": 0.09672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4792261705886175e-06, "loss": 0.0, "num_tokens": 50709394.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10940065234899521, "epoch": 0.0968, "grad_norm": 0.0, "learning_rate": 3.479154986126052e-06, "loss": 0.0, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 110.328125, "completions/mean_terminated_length": 110.328125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.113831777125597, "epoch": 0.09688, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4790836806409007e-06, "loss": 0.0, "num_tokens": 50789052.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1058012992143631, "epoch": 0.09696, "grad_norm": 0.0, "learning_rate": 3.4790122541381543e-06, "loss": 0.0, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 105.1171875, "completions/mean_terminated_length": 105.1171875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.11390942707657814, "epoch": 0.09704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4789407066228126e-06, "loss": 0.0, "num_tokens": 50868043.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10817180946469307, "epoch": 0.09712, "grad_norm": 0.0, "learning_rate": 3.4788690380998817e-06, "loss": 0.0, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 102.234375, "completions/mean_terminated_length": 101.02362060546875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.11102504655718803, "epoch": 0.0972, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4787972485743788e-06, "loss": 0.0, "num_tokens": 50946665.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11188415437936783, "epoch": 0.09728, "grad_norm": 0.0, "learning_rate": 3.478725338051328e-06, "loss": 0.0, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 121.4921875, "completions/mean_terminated_length": 121.4921875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.09207893908023834, "epoch": 0.09736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4786533065357624e-06, "loss": 0.0, "num_tokens": 51027752.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09277597442269325, "epoch": 0.09744, "grad_norm": 0.0, "learning_rate": 3.4785811540327235e-06, "loss": 0.0, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 116.1953125, "completions/mean_terminated_length": 116.1953125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.10693808645009995, "epoch": 0.09752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4785088805472606e-06, "loss": 0.0, "num_tokens": 51108161.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11056554690003395, "epoch": 0.0976, "grad_norm": 0.0, "learning_rate": 3.478436486084433e-06, "loss": 0.0, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 108.71875, "completions/mean_terminated_length": 108.71875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.13131405413150787, "epoch": 0.09768, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.478363970649307e-06, "loss": 0.0, "num_tokens": 51187613.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1363961026072502, "epoch": 0.09776, "grad_norm": 0.0, "learning_rate": 3.478291334246958e-06, "loss": 0.0, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 256.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 119.6171875, "completions/mean_terminated_length": 117.45238494873047, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.1168687455356121, "epoch": 0.09784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.47821857688247e-06, "loss": 0.0, "num_tokens": 51268460.0, "reward": 0.002499666763469577, "reward_std": 0.0, "rewards/reward_fn/mean": 0.002499666763469577, "rewards/reward_fn/std": 0.006639483384788036, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12124114483594894, "epoch": 0.09792, "grad_norm": 0.0, "learning_rate": 3.478145698560935e-06, "loss": 0.0, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 111.9609375, "completions/mean_terminated_length": 111.9609375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.1217251755297184, "epoch": 0.098, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4780726992874533e-06, "loss": 0.0, "num_tokens": 51348327.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12292628362774849, "epoch": 0.09808, "grad_norm": 0.0, "learning_rate": 3.477999579067135e-06, "loss": 0.0, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 108.0703125, "completions/mean_terminated_length": 108.0703125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.1362610012292862, "epoch": 0.09816, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4779263379050974e-06, "loss": 0.0, "num_tokens": 51427696.0, "reward": 0.4067869484424591, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4067869484424591, "rewards/reward_fn/std": 0.9875356554985046, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.13092228770256042, "epoch": 0.09824, "grad_norm": 0.0, "learning_rate": 3.4778529758064664e-06, "loss": 0.0, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 107.2109375, "completions/mean_terminated_length": 107.2109375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.11340505257248878, "epoch": 0.09832, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.477779492776376e-06, "loss": 0.0, "num_tokens": 51506955.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09971309453248978, "epoch": 0.0984, "grad_norm": 0.0, "learning_rate": 3.4777058888199706e-06, "loss": 0.0, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 129.75, "completions/mean_terminated_length": 128.7559051513672, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.11246441304683685, "epoch": 0.09848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.477632163942401e-06, "loss": 0.0, "num_tokens": 51589099.0, "reward": 0.0074910130351781845, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0074910130351781845, "rewards/reward_fn/std": 0.019897233694791794, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11392981559038162, "epoch": 0.09856, "grad_norm": 0.0, "learning_rate": 3.477558318148827e-06, "loss": 0.0, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 256.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 129.2578125, "completions/mean_terminated_length": 117.34188842773438, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.13410558551549911, "epoch": 0.09864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4774843514444173e-06, "loss": 0.0, "num_tokens": 51671180.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12574894726276398, "epoch": 0.09872, "grad_norm": 0.0, "learning_rate": 3.4774102638343486e-06, "loss": 0.0, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 111.5625, "completions/mean_terminated_length": 111.5625, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 0.10349995270371437, "epoch": 0.0988, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4773360553238065e-06, "loss": 0.0, "num_tokens": 51750996.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10114230215549469, "epoch": 0.09888, "grad_norm": 0.0, "learning_rate": 3.4772617259179844e-06, "loss": 0.0, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 111.2890625, "completions/mean_terminated_length": 110.14960479736328, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.09998670220375061, "epoch": 0.09896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4771872756220848e-06, "loss": 0.0, "num_tokens": 51830777.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10607966035604477, "epoch": 0.09904, "grad_norm": 0.0, "learning_rate": 3.477112704441318e-06, "loss": 0.0, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 256.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 122.2890625, "completions/mean_terminated_length": 120.16667175292969, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.11573387682437897, "epoch": 0.09912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4770380123809045e-06, "loss": 0.0, "num_tokens": 51911966.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11826275289058685, "epoch": 0.0992, "grad_norm": 0.0, "learning_rate": 3.476963199446071e-06, "loss": 0.0, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 256.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 119.9765625, "completions/mean_terminated_length": 114.44715118408203, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.12798530980944633, "epoch": 0.09928, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4768882656420533e-06, "loss": 0.0, "num_tokens": 51992859.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11744630709290504, "epoch": 0.09936, "grad_norm": 0.0, "learning_rate": 3.476813210974097e-06, "loss": 0.0, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 106.2890625, "completions/mean_terminated_length": 106.2890625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.11053912341594696, "epoch": 0.09944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.476738035447454e-06, "loss": 0.0, "num_tokens": 52072000.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10301650315523148, "epoch": 0.09952, "grad_norm": 0.0, "learning_rate": 3.4766627390673863e-06, "loss": 0.0, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 113.609375, "completions/mean_terminated_length": 113.609375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.09974493831396103, "epoch": 0.0996, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.476587321839164e-06, "loss": 0.0, "num_tokens": 52152078.0, "reward": 0.7477015852928162, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7477015852928162, "rewards/reward_fn/std": 1.0934940576553345, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10195750743150711, "epoch": 0.09968, "grad_norm": 0.0, "learning_rate": 3.4765117837680655e-06, "loss": 0.0, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 111.2890625, "completions/mean_terminated_length": 111.2890625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.1257324442267418, "epoch": 0.09976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4764361248593772e-06, "loss": 0.0, "num_tokens": 52231859.0, "reward": 0.8206124305725098, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8206124305725098, "rewards/reward_fn/std": 1.2764060497283936, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12519625574350357, "epoch": 0.09984, "grad_norm": 0.0, "learning_rate": 3.476360345118395e-06, "loss": 0.0, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 102.7109375, "completions/mean_terminated_length": 102.7109375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.10776703432202339, "epoch": 0.09992, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4762844445504227e-06, "loss": 0.0, "num_tokens": 52310542.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09987030550837517, "epoch": 0.1, "grad_norm": 0.0, "learning_rate": 3.476208423160772e-06, "loss": 0.0, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 118.4296875, "completions/mean_terminated_length": 118.4296875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.11423780769109726, "epoch": 0.10008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.476132280954764e-06, "loss": 0.0, "num_tokens": 52391237.0, "reward": 0.07864314317703247, "reward_std": 0.0, "rewards/reward_fn/mean": 0.07864314317703247, "rewards/reward_fn/std": 0.20888777077198029, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1180795207619667, "epoch": 0.10016, "grad_norm": 0.0, "learning_rate": 3.4760560179377274e-06, "loss": 0.0, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 256.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 119.2109375, "completions/mean_terminated_length": 115.92800903320312, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.11545830592513084, "epoch": 0.10024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4759796341150007e-06, "loss": 0.0, "num_tokens": 52472032.0, "reward": 0.09519927203655243, "reward_std": 0.0, "rewards/reward_fn/mean": 0.09519927203655243, "rewards/reward_fn/std": 0.2528632879257202, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1201523020863533, "epoch": 0.10032, "grad_norm": 0.0, "learning_rate": 3.4759031294919295e-06, "loss": 0.0, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 107.7578125, "completions/mean_terminated_length": 106.5905532836914, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.12413407117128372, "epoch": 0.1004, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4758265040738675e-06, "loss": 0.0, "num_tokens": 52551361.0, "reward": 0.46453723311424255, "reward_std": 0.0, "rewards/reward_fn/mean": 0.46453723311424255, "rewards/reward_fn/std": 0.990456759929657, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.13203085213899612, "epoch": 0.10048, "grad_norm": 0.0, "learning_rate": 3.475749757866179e-06, "loss": 0.0, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 121.296875, "completions/mean_terminated_length": 121.296875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.09250392019748688, "epoch": 0.10056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4756728908742343e-06, "loss": 0.0, "num_tokens": 52632423.0, "reward": 0.4203384220600128, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4203384220600128, "rewards/reward_fn/std": 0.9860814809799194, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09673825651407242, "epoch": 0.10064, "grad_norm": 0.0, "learning_rate": 3.475595903103414e-06, "loss": 0.0, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 256.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 129.5078125, "completions/mean_terminated_length": 126.47200775146484, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.10168031975626945, "epoch": 0.10072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4755187945591063e-06, "loss": 0.0, "num_tokens": 52714536.0, "reward": 0.1051296517252922, "reward_std": 0.0, "rewards/reward_fn/mean": 0.1051296517252922, "rewards/reward_fn/std": 0.27173131704330444, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10438243672251701, "epoch": 0.1008, "grad_norm": 0.0, "learning_rate": 3.475441565246708e-06, "loss": 0.0, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 102.9609375, "completions/mean_terminated_length": 102.9609375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.09521085768938065, "epoch": 0.10088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4753642151716242e-06, "loss": 0.0, "num_tokens": 52793251.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09620696306228638, "epoch": 0.10096, "grad_norm": 0.0, "learning_rate": 3.4752867443392682e-06, "loss": 0.0, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 128.453125, "completions/mean_terminated_length": 128.453125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.10782148689031601, "epoch": 0.10104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.475209152755063e-06, "loss": 0.0, "num_tokens": 52875229.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10675635561347008, "epoch": 0.10112, "grad_norm": 0.0, "learning_rate": 3.4751314404244386e-06, "loss": 0.0, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 256.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 122.1953125, "completions/mean_terminated_length": 114.45454406738281, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.10004638135433197, "epoch": 0.1012, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4750536073528345e-06, "loss": 0.0, "num_tokens": 52956406.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11263909190893173, "epoch": 0.10128, "grad_norm": 0.0, "learning_rate": 3.4749756535456974e-06, "loss": 0.0, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 256.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 125.28125, "completions/mean_terminated_length": 122.14400482177734, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.10273288935422897, "epoch": 0.10136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4748975790084845e-06, "loss": 0.0, "num_tokens": 53037978.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10738789662718773, "epoch": 0.10144, "grad_norm": 0.0, "learning_rate": 3.4748193837466586e-06, "loss": 0.0, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 115.5703125, "completions/mean_terminated_length": 115.5703125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.11131062358617783, "epoch": 0.10152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4747410677656936e-06, "loss": 0.0, "num_tokens": 53118307.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11085333675146103, "epoch": 0.1016, "grad_norm": 0.0, "learning_rate": 3.4746626310710708e-06, "loss": 0.0, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 124.0078125, "completions/mean_terminated_length": 124.0078125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 0.10745617374777794, "epoch": 0.10168, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4745840736682796e-06, "loss": 0.0, "num_tokens": 53199716.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11472092196345329, "epoch": 0.10176, "grad_norm": 0.0, "learning_rate": 3.4745053955628184e-06, "loss": 0.0, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 112.3671875, "completions/mean_terminated_length": 112.3671875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.0963834747672081, "epoch": 0.10184, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4744265967601935e-06, "loss": 0.0, "num_tokens": 53279635.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10226929932832718, "epoch": 0.10192, "grad_norm": 0.0, "learning_rate": 3.4743476772659205e-06, "loss": 0.0, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 256.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 124.1640625, "completions/mean_terminated_length": 122.0714340209961, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.10619049519300461, "epoch": 0.102, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.474268637085522e-06, "loss": 0.0, "num_tokens": 53361064.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11042249202728271, "epoch": 0.10208, "grad_norm": 0.0, "learning_rate": 3.474189476224531e-06, "loss": 0.0, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 93.375, "completions/mean_terminated_length": 93.375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.11370223388075829, "epoch": 0.10216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4741101946884877e-06, "loss": 0.0, "num_tokens": 53438552.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.110554538667202, "epoch": 0.10224, "grad_norm": 0.0, "learning_rate": 3.474030792482941e-06, "loss": 0.0, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 113.4921875, "completions/mean_terminated_length": 113.4921875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.09807180240750313, "epoch": 0.10232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4739512696134478e-06, "loss": 0.0, "num_tokens": 53518615.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10183721408247948, "epoch": 0.1024, "grad_norm": 0.0, "learning_rate": 3.473871626085574e-06, "loss": 0.0, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 121.890625, "completions/mean_terminated_length": 121.890625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.11055203154683113, "epoch": 0.10248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.473791861904894e-06, "loss": 0.0, "num_tokens": 53599753.0, "reward": 0.0074910130351781845, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0074910130351781845, "rewards/reward_fn/std": 0.019897233694791794, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11396090686321259, "epoch": 0.10256, "grad_norm": 0.0, "learning_rate": 3.4737119770769902e-06, "loss": 0.0, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 112.4921875, "completions/mean_terminated_length": 112.4921875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.09983281046152115, "epoch": 0.10264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.473631971607454e-06, "loss": 0.0, "num_tokens": 53679688.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10282871127128601, "epoch": 0.10272, "grad_norm": 0.0, "learning_rate": 3.4735518455018846e-06, "loss": 0.0, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 117.8359375, "completions/mean_terminated_length": 117.8359375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.10669086128473282, "epoch": 0.1028, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.47347159876589e-06, "loss": 0.0, "num_tokens": 53760307.0, "reward": 0.3874585032463074, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3874585032463074, "rewards/reward_fn/std": 0.9918686747550964, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11259639635682106, "epoch": 0.10288, "grad_norm": 0.0, "learning_rate": 3.4733912314050873e-06, "loss": 0.0, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 105.203125, "completions/mean_terminated_length": 105.203125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.10973065346479416, "epoch": 0.10296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4733107434251004e-06, "loss": 0.0, "num_tokens": 53839309.0, "reward": 0.3923865556716919, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3923865556716919, "rewards/reward_fn/std": 0.9905130863189697, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11743086948990822, "epoch": 0.10304, "grad_norm": 0.0, "learning_rate": 3.4732301348315635e-06, "loss": 0.0, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 256.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 122.34375, "completions/mean_terminated_length": 116.91056823730469, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.10903063789010048, "epoch": 0.10312, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4731494056301178e-06, "loss": 0.0, "num_tokens": 53920505.0, "reward": 0.3923865556716919, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3923865556716919, "rewards/reward_fn/std": 0.9905130863189697, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10703249648213387, "epoch": 0.1032, "grad_norm": 0.0, "learning_rate": 3.473068555826414e-06, "loss": 0.0, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 122.75, "completions/mean_terminated_length": 122.75, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.1074083223938942, "epoch": 0.10328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4729875854261095e-06, "loss": 0.0, "num_tokens": 54001753.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10468226298689842, "epoch": 0.10336, "grad_norm": 0.0, "learning_rate": 3.4729064944348725e-06, "loss": 0.0, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 116.9609375, "completions/mean_terminated_length": 116.9609375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.0879639983177185, "epoch": 0.10344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.472825282858378e-06, "loss": 0.0, "num_tokens": 54082260.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08952264487743378, "epoch": 0.10352, "grad_norm": 0.0, "learning_rate": 3.4727439507023104e-06, "loss": 0.0, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 104.1796875, "completions/mean_terminated_length": 104.1796875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.0991598442196846, "epoch": 0.1036, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.472662497972362e-06, "loss": 0.0, "num_tokens": 54161131.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09952304884791374, "epoch": 0.10368, "grad_norm": 0.0, "learning_rate": 3.472580924674234e-06, "loss": 0.0, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 110.5, "completions/mean_terminated_length": 110.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.11977525800466537, "epoch": 0.10376, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.472499230813634e-06, "loss": 0.0, "num_tokens": 54240811.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10997357964515686, "epoch": 0.10384, "grad_norm": 0.0, "learning_rate": 3.4724174163962815e-06, "loss": 0.0, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 118.3515625, "completions/mean_terminated_length": 118.3515625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.10858000069856644, "epoch": 0.10392, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.472335481427902e-06, "loss": 0.0, "num_tokens": 54321496.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11103647202253342, "epoch": 0.104, "grad_norm": 0.0, "learning_rate": 3.4722534259142303e-06, "loss": 0.0, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 256.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 122.671875, "completions/mean_terminated_length": 119.47200775146484, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.11343058198690414, "epoch": 0.10408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.472171249861009e-06, "loss": 0.0, "num_tokens": 54402734.0, "reward": 0.3948310613632202, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3948310613632202, "rewards/reward_fn/std": 0.9899041056632996, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11205988749861717, "epoch": 0.10416, "grad_norm": 0.0, "learning_rate": 3.4720889532739896e-06, "loss": 0.0, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 122.0859375, "completions/mean_terminated_length": 122.0859375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.10193099081516266, "epoch": 0.10424, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.472006536158932e-06, "loss": 0.0, "num_tokens": 54483897.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09957262128591537, "epoch": 0.10432, "grad_norm": 0.0, "learning_rate": 3.471923998521605e-06, "loss": 0.0, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 256.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 115.9296875, "completions/mean_terminated_length": 110.23577117919922, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.12731429934501648, "epoch": 0.1044, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4718413403677855e-06, "loss": 0.0, "num_tokens": 54564272.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.13666541874408722, "epoch": 0.10448, "grad_norm": 0.0, "learning_rate": 3.4717585617032574e-06, "loss": 0.0, "step": 1306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1171875, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 124.640625, "completions/mean_terminated_length": 107.20353698730469, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.1304742842912674, "epoch": 0.10456, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.471675662533816e-06, "loss": 0.0, "num_tokens": 54645762.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.14192385226488113, "epoch": 0.10464, "grad_norm": 0.0, "learning_rate": 3.471592642865262e-06, "loss": 0.0, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 104.109375, "completions/mean_terminated_length": 99.20967102050781, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 0.12140092253684998, "epoch": 0.10472, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4715095027034067e-06, "loss": 0.0, "num_tokens": 54724624.0, "reward": 0.4287605285644531, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4287605285644531, "rewards/reward_fn/std": 0.9858390688896179, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1307300627231598, "epoch": 0.1048, "grad_norm": 0.0, "learning_rate": 3.471426242054069e-06, "loss": 0.0, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 136.3515625, "completions/mean_terminated_length": 132.49192810058594, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.11742405220866203, "epoch": 0.10488, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4713428609230755e-06, "loss": 0.0, "num_tokens": 54807613.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12283401191234589, "epoch": 0.10496, "grad_norm": 0.0, "learning_rate": 3.4712593593162632e-06, "loss": 0.0, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 256.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 125.953125, "completions/mean_terminated_length": 122.8320083618164, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.11513060703873634, "epoch": 0.10504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.471175737239475e-06, "loss": 0.0, "num_tokens": 54889271.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11927731335163116, "epoch": 0.10512, "grad_norm": 0.0, "learning_rate": 3.471091994698565e-06, "loss": 0.0, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 117.3984375, "completions/mean_terminated_length": 116.30708312988281, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.11793725937604904, "epoch": 0.1052, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.471008131699393e-06, "loss": 0.0, "num_tokens": 54969834.0, "reward": 0.4067869484424591, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4067869484424591, "rewards/reward_fn/std": 0.9875356554985046, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11248406767845154, "epoch": 0.10528, "grad_norm": 0.0, "learning_rate": 3.4709241482478287e-06, "loss": 0.0, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 102.625, "completions/mean_terminated_length": 102.625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.0984947681427002, "epoch": 0.10536, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4708400443497508e-06, "loss": 0.0, "num_tokens": 55048506.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09634802490472794, "epoch": 0.10544, "grad_norm": 0.0, "learning_rate": 3.4707558200110454e-06, "loss": 0.0, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 91.796875, "completions/mean_terminated_length": 91.796875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.13032419234514236, "epoch": 0.10552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.470671475237607e-06, "loss": 0.0, "num_tokens": 55125792.0, "reward": 0.41141408681869507, "reward_std": 0.0, "rewards/reward_fn/mean": 0.41141408681869507, "rewards/reward_fn/std": 0.9868917465209961, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12809529155492783, "epoch": 0.1056, "grad_norm": 0.0, "learning_rate": 3.47058701003534e-06, "loss": 0.0, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 108.5390625, "completions/mean_terminated_length": 108.5390625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.1092277318239212, "epoch": 0.10568, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.470502424410154e-06, "loss": 0.0, "num_tokens": 55205221.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10664983466267586, "epoch": 0.10576, "grad_norm": 0.0, "learning_rate": 3.4704177183679705e-06, "loss": 0.0, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 256.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 118.1875, "completions/mean_terminated_length": 112.58536529541016, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.12635096907615662, "epoch": 0.10584, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.470332891914718e-06, "loss": 0.0, "num_tokens": 55285885.0, "reward": 0.1743069887161255, "reward_std": 0.0, "rewards/reward_fn/mean": 0.1743069887161255, "rewards/reward_fn/std": 0.3199642300605774, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12400953099131584, "epoch": 0.10592, "grad_norm": 0.0, "learning_rate": 3.470247945056333e-06, "loss": 0.0, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 117.9140625, "completions/mean_terminated_length": 117.9140625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.09528264030814171, "epoch": 0.106, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4701628777987615e-06, "loss": 0.0, "num_tokens": 55366514.0, "reward": 0.0688910037279129, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0688910037279129, "rewards/reward_fn/std": 0.18298465013504028, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09694351255893707, "epoch": 0.10608, "grad_norm": 0.0, "learning_rate": 3.4700776901479565e-06, "loss": 0.0, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 111.0703125, "completions/mean_terminated_length": 111.0703125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.1240697093307972, "epoch": 0.10616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.469992382109881e-06, "loss": 0.0, "num_tokens": 55446267.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.13602536171674728, "epoch": 0.10624, "grad_norm": 0.0, "learning_rate": 3.4699069536905044e-06, "loss": 0.0, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 108.1953125, "completions/mean_terminated_length": 107.031494140625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.13135579973459244, "epoch": 0.10632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4698214048958076e-06, "loss": 0.0, "num_tokens": 55525652.0, "reward": 0.39967191219329834, "reward_std": 0.0, "rewards/reward_fn/mean": 0.39967191219329834, "rewards/reward_fn/std": 0.988822877407074, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1283053383231163, "epoch": 0.1064, "grad_norm": 0.0, "learning_rate": 3.469735735731777e-06, "loss": 0.0, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 98.9375, "completions/mean_terminated_length": 98.9375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.12107482552528381, "epoch": 0.10648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4696499462044084e-06, "loss": 0.0, "num_tokens": 55603852.0, "reward": 0.08438373357057571, "reward_std": 0.0, "rewards/reward_fn/mean": 0.08438373357057571, "rewards/reward_fn/std": 0.224135622382164, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12246451526880264, "epoch": 0.10656, "grad_norm": 0.0, "learning_rate": 3.4695640363197073e-06, "loss": 0.0, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 108.4609375, "completions/mean_terminated_length": 108.4609375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.1271132528781891, "epoch": 0.10664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4694780060836848e-06, "loss": 0.0, "num_tokens": 55683271.0, "reward": 0.06162349507212639, "reward_std": 0.0, "rewards/reward_fn/mean": 0.06162349507212639, "rewards/reward_fn/std": 0.16368108987808228, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12170031666755676, "epoch": 0.10672, "grad_norm": 0.0, "learning_rate": 3.4693918555023634e-06, "loss": 0.0, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 114.7578125, "completions/mean_terminated_length": 114.7578125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.11193994805216789, "epoch": 0.1068, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.469305584581772e-06, "loss": 0.0, "num_tokens": 55763496.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11903609707951546, "epoch": 0.10688, "grad_norm": 0.0, "learning_rate": 3.4692191933279494e-06, "loss": 0.0, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 121.953125, "completions/mean_terminated_length": 121.953125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.11467370390892029, "epoch": 0.10696, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.469132681746942e-06, "loss": 0.0, "num_tokens": 55844642.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11534377560019493, "epoch": 0.10704, "grad_norm": 0.0, "learning_rate": 3.4690460498448034e-06, "loss": 0.0, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 103.03125, "completions/mean_terminated_length": 101.82677459716797, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.11732485890388489, "epoch": 0.10712, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4689592976275983e-06, "loss": 0.0, "num_tokens": 55923366.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1238110400736332, "epoch": 0.1072, "grad_norm": 0.0, "learning_rate": 3.468872425101398e-06, "loss": 0.0, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 122.6484375, "completions/mean_terminated_length": 121.59842681884766, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.10521713644266129, "epoch": 0.10728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.468785432272283e-06, "loss": 0.0, "num_tokens": 56004601.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10392987728118896, "epoch": 0.10736, "grad_norm": 0.0, "learning_rate": 3.4686983191463413e-06, "loss": 0.0, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 120.3046875, "completions/mean_terminated_length": 120.3046875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.11054778471589088, "epoch": 0.10744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4686110857296698e-06, "loss": 0.0, "num_tokens": 56085536.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11398602277040482, "epoch": 0.10752, "grad_norm": 0.0, "learning_rate": 3.468523732028375e-06, "loss": 0.0, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 99.1015625, "completions/mean_terminated_length": 99.1015625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.11284170672297478, "epoch": 0.1076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.468436258048569e-06, "loss": 0.0, "num_tokens": 56163757.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11728502064943314, "epoch": 0.10768, "grad_norm": 0.0, "learning_rate": 3.4683486637963756e-06, "loss": 0.0, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 256.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 112.7890625, "completions/mean_terminated_length": 109.35200500488281, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.11876781657338142, "epoch": 0.10776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4682609492779255e-06, "loss": 0.0, "num_tokens": 56243730.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11198745667934418, "epoch": 0.10784, "grad_norm": 0.0, "learning_rate": 3.4681731144993566e-06, "loss": 0.0, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 112.671875, "completions/mean_terminated_length": 112.671875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.12260394170880318, "epoch": 0.10792, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4680851594668167e-06, "loss": 0.0, "num_tokens": 56323688.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11703531071543694, "epoch": 0.108, "grad_norm": 0.0, "learning_rate": 3.4679970841864625e-06, "loss": 0.0, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 101.1875, "completions/mean_terminated_length": 101.1875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.16271410882472992, "epoch": 0.10808, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4679088886644583e-06, "loss": 0.0, "num_tokens": 56402176.0, "reward": 0.45513463020324707, "reward_std": 0.0, "rewards/reward_fn/mean": 0.45513463020324707, "rewards/reward_fn/std": 0.988360583782196, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.15878260880708694, "epoch": 0.10816, "grad_norm": 0.0, "learning_rate": 3.467820572906976e-06, "loss": 0.0, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 116.4375, "completions/mean_terminated_length": 116.4375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.10187884047627449, "epoch": 0.10824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4677321369201966e-06, "loss": 0.0, "num_tokens": 56482616.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10231256112456322, "epoch": 0.10832, "grad_norm": 0.0, "learning_rate": 3.467643580710311e-06, "loss": 0.0, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 118.6171875, "completions/mean_terminated_length": 114.18547821044922, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.12054432928562164, "epoch": 0.1084, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4675549042835163e-06, "loss": 0.0, "num_tokens": 56563335.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12073773890733719, "epoch": 0.10848, "grad_norm": 0.0, "learning_rate": 3.467466107646019e-06, "loss": 0.0, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 256.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 116.4140625, "completions/mean_terminated_length": 114.19841766357422, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.11313008889555931, "epoch": 0.10856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4673771908040343e-06, "loss": 0.0, "num_tokens": 56643772.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11546922475099564, "epoch": 0.10864, "grad_norm": 0.0, "learning_rate": 3.4672881537637856e-06, "loss": 0.0, "step": 1358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 110.390625, "completions/mean_terminated_length": 110.390625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.09868573769927025, "epoch": 0.10872, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.467198996531503e-06, "loss": 0.0, "num_tokens": 56723438.0, "reward": 0.0074910130351781845, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0074910130351781845, "rewards/reward_fn/std": 0.019897233694791794, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09933455660939217, "epoch": 0.1088, "grad_norm": 0.0, "learning_rate": 3.4671097191134283e-06, "loss": 0.0, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 107.6328125, "completions/mean_terminated_length": 106.46456909179688, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.11059986054897308, "epoch": 0.10888, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4670203215158095e-06, "loss": 0.0, "num_tokens": 56802751.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10829007998108864, "epoch": 0.10896, "grad_norm": 0.0, "learning_rate": 3.4669308037449035e-06, "loss": 0.0, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 112.8046875, "completions/mean_terminated_length": 112.8046875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.1284579113125801, "epoch": 0.10904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4668411658069748e-06, "loss": 0.0, "num_tokens": 56882726.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12365762889385223, "epoch": 0.10912, "grad_norm": 0.0, "learning_rate": 3.4667514077082983e-06, "loss": 0.0, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 124.6796875, "completions/mean_terminated_length": 123.64566802978516, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.10620241239666939, "epoch": 0.1092, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.466661529455155e-06, "loss": 0.0, "num_tokens": 56964221.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10639988258481026, "epoch": 0.10928, "grad_norm": 0.0, "learning_rate": 3.4665715310538367e-06, "loss": 0.0, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 115.2109375, "completions/mean_terminated_length": 115.2109375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.1043931245803833, "epoch": 0.10936, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4664814125106415e-06, "loss": 0.0, "num_tokens": 57044504.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1046062670648098, "epoch": 0.10944, "grad_norm": 0.0, "learning_rate": 3.4663911738318766e-06, "loss": 0.0, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 114.6796875, "completions/mean_terminated_length": 113.56692504882812, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.10956894978880882, "epoch": 0.10952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4663008150238582e-06, "loss": 0.0, "num_tokens": 57124719.0, "reward": 0.4347124993801117, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4347124993801117, "rewards/reward_fn/std": 0.9859739542007446, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12244190275669098, "epoch": 0.1096, "grad_norm": 0.0, "learning_rate": 3.466210336092911e-06, "loss": 0.0, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 115.2421875, "completions/mean_terminated_length": 115.2421875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.10731690376996994, "epoch": 0.10968, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4661197370453655e-06, "loss": 0.0, "num_tokens": 57205006.0, "reward": 0.04533843323588371, "reward_std": 0.0, "rewards/reward_fn/mean": 0.04533843323588371, "rewards/reward_fn/std": 0.12042555958032608, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10656939074397087, "epoch": 0.10976, "grad_norm": 0.0, "learning_rate": 3.466029017887565e-06, "loss": 0.0, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 98.71875, "completions/mean_terminated_length": 98.71875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.12883716449141502, "epoch": 0.10984, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4659381786258577e-06, "loss": 0.0, "num_tokens": 57283178.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12416808307170868, "epoch": 0.10992, "grad_norm": 0.0, "learning_rate": 3.4658472192666017e-06, "loss": 0.0, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 112.859375, "completions/mean_terminated_length": 110.58731079101562, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.11053011566400528, "epoch": 0.11, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4657561398161626e-06, "loss": 0.0, "num_tokens": 57363160.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10882564261555672, "epoch": 0.11008, "grad_norm": 0.0, "learning_rate": 3.4656649402809164e-06, "loss": 0.0, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 256.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 122.8515625, "completions/mean_terminated_length": 120.73810577392578, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.11811903491616249, "epoch": 0.11016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4655736206672446e-06, "loss": 0.0, "num_tokens": 57444421.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11450444534420967, "epoch": 0.11024, "grad_norm": 0.0, "learning_rate": 3.4654821809815395e-06, "loss": 0.0, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 105.7734375, "completions/mean_terminated_length": 105.7734375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.12743107229471207, "epoch": 0.11032, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4653906212302008e-06, "loss": 0.0, "num_tokens": 57523496.0, "reward": 0.76492840051651, "reward_std": 0.0, "rewards/reward_fn/mean": 0.76492840051651, "rewards/reward_fn/std": 1.296067476272583, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11739801615476608, "epoch": 0.1104, "grad_norm": 0.0, "learning_rate": 3.465298941419636e-06, "loss": 0.0, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 256.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 111.9296875, "completions/mean_terminated_length": 109.64286041259766, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.11905653402209282, "epoch": 0.11048, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4652071415562625e-06, "loss": 0.0, "num_tokens": 57603359.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11805332824587822, "epoch": 0.11056, "grad_norm": 0.0, "learning_rate": 3.4651152216465052e-06, "loss": 0.0, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 106.1796875, "completions/mean_terminated_length": 105.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.10950208082795143, "epoch": 0.11064, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4650231816967977e-06, "loss": 0.0, "num_tokens": 57682486.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11595014855265617, "epoch": 0.11072, "grad_norm": 0.0, "learning_rate": 3.4649310217135815e-06, "loss": 0.0, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 105.6875, "completions/mean_terminated_length": 105.6875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.10882537439465523, "epoch": 0.1108, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4648387417033064e-06, "loss": 0.0, "num_tokens": 57761550.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10971024632453918, "epoch": 0.11088, "grad_norm": 0.0, "learning_rate": 3.464746341672432e-06, "loss": 0.0, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 115.109375, "completions/mean_terminated_length": 115.109375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.09582418948411942, "epoch": 0.11096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4646538216274246e-06, "loss": 0.0, "num_tokens": 57841820.0, "reward": 0.3923865556716919, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3923865556716919, "rewards/reward_fn/std": 0.9905130863189697, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09414370730519295, "epoch": 0.11104, "grad_norm": 0.0, "learning_rate": 3.46456118157476e-06, "loss": 0.0, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 123.828125, "completions/mean_terminated_length": 123.828125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.11982530727982521, "epoch": 0.11112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4644684215209216e-06, "loss": 0.0, "num_tokens": 57923206.0, "reward": 0.004997334908694029, "reward_std": 0.0, "rewards/reward_fn/mean": 0.004997334908694029, "rewards/reward_fn/std": 0.013273656368255615, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11702412739396095, "epoch": 0.1112, "grad_norm": 0.0, "learning_rate": 3.464375541472402e-06, "loss": 0.0, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 112.046875, "completions/mean_terminated_length": 112.046875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.09603938460350037, "epoch": 0.11128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4642825414357016e-06, "loss": 0.0, "num_tokens": 58003084.0, "reward": 0.06122390925884247, "reward_std": 0.0, "rewards/reward_fn/mean": 0.06122390925884247, "rewards/reward_fn/std": 0.10656410455703735, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09413010254502296, "epoch": 0.11136, "grad_norm": 0.0, "learning_rate": 3.46418942141733e-06, "loss": 0.0, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 113.8828125, "completions/mean_terminated_length": 113.8828125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.12109982222318649, "epoch": 0.11144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4640961814238036e-06, "loss": 0.0, "num_tokens": 58083197.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12424036487936974, "epoch": 0.11152, "grad_norm": 0.0, "learning_rate": 3.4640028214616492e-06, "loss": 0.0, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 113.2421875, "completions/mean_terminated_length": 113.2421875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.11244867369532585, "epoch": 0.1116, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4639093415374004e-06, "loss": 0.0, "num_tokens": 58163228.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11519478261470795, "epoch": 0.11168, "grad_norm": 0.0, "learning_rate": 3.4638157416576e-06, "loss": 0.0, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 115.59375, "completions/mean_terminated_length": 115.59375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.10606054216623306, "epoch": 0.11176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.463722021828799e-06, "loss": 0.0, "num_tokens": 58243560.0, "reward": 0.11908485740423203, "reward_std": 0.0, "rewards/reward_fn/mean": 0.11908485740423203, "rewards/reward_fn/std": 0.3163069188594818, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10107476264238358, "epoch": 0.11184, "grad_norm": 0.0, "learning_rate": 3.463628182057557e-06, "loss": 0.0, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 117.625, "completions/mean_terminated_length": 117.625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.0881023071706295, "epoch": 0.11192, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4635342223504415e-06, "loss": 0.0, "num_tokens": 58324152.0, "reward": 0.07061244547367096, "reward_std": 0.0, "rewards/reward_fn/mean": 0.07061244547367096, "rewards/reward_fn/std": 0.18755705654621124, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08910387009382248, "epoch": 0.112, "grad_norm": 0.0, "learning_rate": 3.4634401427140283e-06, "loss": 0.0, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 116.4453125, "completions/mean_terminated_length": 116.4453125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.10629783570766449, "epoch": 0.11208, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.463345943154903e-06, "loss": 0.0, "num_tokens": 58404593.0, "reward": 0.12013623118400574, "reward_std": 0.0, "rewards/reward_fn/mean": 0.12013623118400574, "rewards/reward_fn/std": 0.3190995156764984, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1145402230322361, "epoch": 0.11216, "grad_norm": 0.0, "learning_rate": 3.463251623679658e-06, "loss": 0.0, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 129.9375, "completions/mean_terminated_length": 128.94488525390625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.1022961400449276, "epoch": 0.11224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4631571842948948e-06, "loss": 0.0, "num_tokens": 58486761.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09702368453145027, "epoch": 0.11232, "grad_norm": 0.0, "learning_rate": 3.463062625007223e-06, "loss": 0.0, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 256.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 127.3125, "completions/mean_terminated_length": 120.98359680175781, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.13086721301078796, "epoch": 0.1124, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4629679458232603e-06, "loss": 0.0, "num_tokens": 58568593.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12160508707165718, "epoch": 0.11248, "grad_norm": 0.0, "learning_rate": 3.462873146749634e-06, "loss": 0.0, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 256.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 113.984375, "completions/mean_terminated_length": 97.93042755126953, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 0.14250093698501587, "epoch": 0.11256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.462778227792979e-06, "loss": 0.0, "num_tokens": 58648719.0, "reward": 0.49987494945526123, "reward_std": 0.0, "rewards/reward_fn/mean": 0.49987494945526123, "rewards/reward_fn/std": 1.0038665533065796, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.14612042903900146, "epoch": 0.11264, "grad_norm": 0.0, "learning_rate": 3.4626831889599385e-06, "loss": 0.0, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 116.609375, "completions/mean_terminated_length": 116.609375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.10210934653878212, "epoch": 0.11272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4625880302571635e-06, "loss": 0.0, "num_tokens": 58729181.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10653179883956909, "epoch": 0.1128, "grad_norm": 0.0, "learning_rate": 3.462492751691315e-06, "loss": 0.0, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 256.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 123.8671875, "completions/mean_terminated_length": 121.76985168457031, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.1258917674422264, "epoch": 0.11288, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.462397353269062e-06, "loss": 0.0, "num_tokens": 58810572.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11735067516565323, "epoch": 0.11296, "grad_norm": 0.0, "learning_rate": 3.4623018349970797e-06, "loss": 0.0, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 99.6640625, "completions/mean_terminated_length": 99.6640625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.1103239431977272, "epoch": 0.11304, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4622061968820545e-06, "loss": 0.0, "num_tokens": 58888865.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10473310574889183, "epoch": 0.11312, "grad_norm": 0.0, "learning_rate": 3.4621104389306802e-06, "loss": 0.0, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 101.28125, "completions/mean_terminated_length": 101.28125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.12672145664691925, "epoch": 0.1132, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.462014561149658e-06, "loss": 0.0, "num_tokens": 58967365.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11699289083480835, "epoch": 0.11328, "grad_norm": 0.0, "learning_rate": 3.4619185635456992e-06, "loss": 0.0, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 110.8828125, "completions/mean_terminated_length": 109.74015808105469, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.12438993901014328, "epoch": 0.11336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.461822446125522e-06, "loss": 0.0, "num_tokens": 59047094.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.14683134481310844, "epoch": 0.11344, "grad_norm": 0.0, "learning_rate": 3.461726208895854e-06, "loss": 0.0, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 111.9453125, "completions/mean_terminated_length": 110.81101989746094, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.11764026433229446, "epoch": 0.11352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.46162985186343e-06, "loss": 0.0, "num_tokens": 59126959.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11927098408341408, "epoch": 0.1136, "grad_norm": 0.0, "learning_rate": 3.461533375034995e-06, "loss": 0.0, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 118.15625, "completions/mean_terminated_length": 118.15625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.1095087043941021, "epoch": 0.11368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4614367784173016e-06, "loss": 0.0, "num_tokens": 59207619.0, "reward": 0.02467191591858864, "reward_std": 0.0, "rewards/reward_fn/mean": 0.02467191591858864, "rewards/reward_fn/std": 0.06553223729133606, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11131378263235092, "epoch": 0.11376, "grad_norm": 0.0, "learning_rate": 3.4613400620171093e-06, "loss": 0.0, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 120.203125, "completions/mean_terminated_length": 116.94400787353516, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.10387607663869858, "epoch": 0.11384, "frac_reward_zero_std": 0.875, "grad_norm": 0.21302951872348785, "learning_rate": 3.4612432258411876e-06, "loss": -0.0152, "num_tokens": 59288541.0, "reward": 0.3287436366081238, "reward_std": 0.16770508885383606, "rewards/reward_fn/mean": 0.3287436366081238, "rewards/reward_fn/std": 0.8715833425521851, "step": 1423 }, { "clip_ratio/high_max": 0.009131942759267986, "clip_ratio/high_mean": 0.0022829856898169965, "clip_ratio/low_mean": 0.0019532304140739143, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00423621607478708, "entropy": 0.10287509858608246, "epoch": 0.11392, "grad_norm": 0.4850810766220093, "learning_rate": 3.4611462698963144e-06, "loss": 0.0075, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 117.0078125, "completions/mean_terminated_length": 115.91338348388672, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.09922660887241364, "epoch": 0.114, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4610491941892758e-06, "loss": 0.0, "num_tokens": 59369054.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10290862247347832, "epoch": 0.11408, "grad_norm": 0.0, "learning_rate": 3.4609519987268654e-06, "loss": 0.0, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 99.515625, "completions/mean_terminated_length": 99.515625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.13665099442005157, "epoch": 0.11416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4608546835158856e-06, "loss": 0.0, "num_tokens": 59447328.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12569760158658028, "epoch": 0.11424, "grad_norm": 0.0, "learning_rate": 3.4607572485631485e-06, "loss": 0.0, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 107.640625, "completions/mean_terminated_length": 107.640625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.11617914587259293, "epoch": 0.11432, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4606596938754737e-06, "loss": 0.0, "num_tokens": 59526642.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11512690037488937, "epoch": 0.1144, "grad_norm": 0.0, "learning_rate": 3.4605620194596872e-06, "loss": 0.0, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 101.2421875, "completions/mean_terminated_length": 101.2421875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.13150369375944138, "epoch": 0.11448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.460464225322627e-06, "loss": 0.0, "num_tokens": 59605137.0, "reward": 0.46803462505340576, "reward_std": 0.0, "rewards/reward_fn/mean": 0.46803462505340576, "rewards/reward_fn/std": 0.9913958311080933, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.13494587689638138, "epoch": 0.11456, "grad_norm": 0.0, "learning_rate": 3.460366311471137e-06, "loss": 0.0, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 121.5390625, "completions/mean_terminated_length": 120.48031616210938, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.11762598156929016, "epoch": 0.11464, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4602682779120696e-06, "loss": 0.0, "num_tokens": 59686230.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10980403050780296, "epoch": 0.11472, "grad_norm": 0.0, "learning_rate": 3.4601701246522865e-06, "loss": 0.0, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 101.453125, "completions/mean_terminated_length": 101.453125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.12990939617156982, "epoch": 0.1148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4600718516986584e-06, "loss": 0.0, "num_tokens": 59764752.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.13756988942623138, "epoch": 0.11488, "grad_norm": 0.0, "learning_rate": 3.4599734590580614e-06, "loss": 0.0, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 96.859375, "completions/mean_terminated_length": 96.859375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.14695914834737778, "epoch": 0.11496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4598749467373838e-06, "loss": 0.0, "num_tokens": 59842686.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.14885981380939484, "epoch": 0.11504, "grad_norm": 0.0, "learning_rate": 3.459776314743519e-06, "loss": 0.0, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 114.59375, "completions/mean_terminated_length": 114.59375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.12188331037759781, "epoch": 0.11512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4596775630833715e-06, "loss": 0.0, "num_tokens": 59922890.0, "reward": 0.3972600996494293, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3972600996494293, "rewards/reward_fn/std": 0.9893408417701721, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1149609163403511, "epoch": 0.1152, "grad_norm": 0.0, "learning_rate": 3.4595786917638515e-06, "loss": 0.0, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 90.828125, "completions/mean_terminated_length": 85.5, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.17984844744205475, "epoch": 0.11528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.45947970079188e-06, "loss": 0.0, "num_tokens": 60000052.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.150923952460289, "epoch": 0.11536, "grad_norm": 0.0, "learning_rate": 3.4593805901743846e-06, "loss": 0.0, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 114.03125, "completions/mean_terminated_length": 104.5666732788086, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.18367621302604675, "epoch": 0.11544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4592813599183026e-06, "loss": 0.0, "num_tokens": 60080184.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.18414772301912308, "epoch": 0.11552, "grad_norm": 0.0, "learning_rate": 3.4591820100305788e-06, "loss": 0.0, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 256.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 117.546875, "completions/mean_terminated_length": 109.53718566894531, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.15306276082992554, "epoch": 0.1156, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.459082540518167e-06, "loss": 0.0, "num_tokens": 60160766.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1530698537826538, "epoch": 0.11568, "grad_norm": 0.0, "learning_rate": 3.4589829513880283e-06, "loss": 0.0, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 110.8359375, "completions/mean_terminated_length": 103.69671630859375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.15565698593854904, "epoch": 0.11576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.458883242647133e-06, "loss": 0.0, "num_tokens": 60240489.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.15796495974063873, "epoch": 0.11584, "grad_norm": 0.0, "learning_rate": 3.4587834143024606e-06, "loss": 0.0, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 110.703125, "completions/mean_terminated_length": 110.703125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.14684555679559708, "epoch": 0.11592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4586834663609974e-06, "loss": 0.0, "num_tokens": 60320195.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.15567582100629807, "epoch": 0.116, "grad_norm": 0.0, "learning_rate": 3.4585833988297377e-06, "loss": 0.0, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 100.2421875, "completions/mean_terminated_length": 95.21773529052734, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.16430149972438812, "epoch": 0.11608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.458483211715687e-06, "loss": 0.0, "num_tokens": 60398562.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.15543138980865479, "epoch": 0.11616, "grad_norm": 0.0, "learning_rate": 3.4583829050258565e-06, "loss": 0.0, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 114.2734375, "completions/mean_terminated_length": 107.30327606201172, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.14880726486444473, "epoch": 0.11624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.458282478767266e-06, "loss": 0.0, "num_tokens": 60478725.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.17766378819942474, "epoch": 0.11632, "grad_norm": 0.0, "learning_rate": 3.4581819329469454e-06, "loss": 0.0, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 256.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 111.4453125, "completions/mean_terminated_length": 105.56909942626953, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.1454647332429886, "epoch": 0.1164, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.458081267571931e-06, "loss": 0.0, "num_tokens": 60558526.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.15822087228298187, "epoch": 0.11648, "grad_norm": 0.0, "learning_rate": 3.4579804826492684e-06, "loss": 0.0, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 112.21875, "completions/mean_terminated_length": 107.58064270019531, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.15295253694057465, "epoch": 0.11656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4578795781860125e-06, "loss": 0.0, "num_tokens": 60638426.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.17206431180238724, "epoch": 0.11664, "grad_norm": 0.0, "learning_rate": 3.4577785541892243e-06, "loss": 0.0, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 256.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 117.8125, "completions/mean_terminated_length": 107.3613510131836, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.19342119991779327, "epoch": 0.11672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4576774106659745e-06, "loss": 0.0, "num_tokens": 60719042.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.16972249746322632, "epoch": 0.1168, "grad_norm": 0.0, "learning_rate": 3.4575761476233427e-06, "loss": 0.0, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 256.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 124.5859375, "completions/mean_terminated_length": 109.73043060302734, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.1747571974992752, "epoch": 0.11688, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.457474765068416e-06, "loss": 0.0, "num_tokens": 60800525.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.18448807299137115, "epoch": 0.11696, "grad_norm": 0.0, "learning_rate": 3.4573732630082906e-06, "loss": 0.0, "step": 1462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 256.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 140.5390625, "completions/mean_terminated_length": 115.24762725830078, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.18650325387716293, "epoch": 0.11704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.457271641450069e-06, "loss": 0.0, "num_tokens": 60884050.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.19240184128284454, "epoch": 0.11712, "grad_norm": 0.0, "learning_rate": 3.4571699004008657e-06, "loss": 0.0, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 110.296875, "completions/mean_terminated_length": 105.59677124023438, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.17164620012044907, "epoch": 0.1172, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4570680398678e-06, "loss": 0.0, "num_tokens": 60963704.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.15694577991962433, "epoch": 0.11728, "grad_norm": 0.0, "learning_rate": 3.4569660598580015e-06, "loss": 0.0, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1171875, "completions/max_length": 256.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 117.4921875, "completions/mean_terminated_length": 99.10619354248047, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.1815410703420639, "epoch": 0.11736, "frac_reward_zero_std": 0.875, "grad_norm": 0.9290214776992798, "learning_rate": 3.456863960378608e-06, "loss": 0.0505, "num_tokens": 61044279.0, "reward": -0.0234375, "reward_std": 0.09375, "rewards/reward_fn/mean": -0.0234375, "rewards/reward_fn/std": 0.2651650309562683, "step": 1467 }, { "clip_ratio/high_max": 0.014688342344015837, "clip_ratio/high_mean": 0.003672085586003959, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003672085586003959, "entropy": 0.2072564959526062, "epoch": 0.11744, "grad_norm": 0.1075846254825592, "learning_rate": 3.456761741436765e-06, "loss": -0.0098, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 256.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 125.375, "completions/mean_terminated_length": 106.71428680419922, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.19011501967906952, "epoch": 0.11752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.456659403039627e-06, "loss": 0.0, "num_tokens": 61125863.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.19718527793884277, "epoch": 0.1176, "grad_norm": 0.0, "learning_rate": 3.4565569451943564e-06, "loss": 0.0, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 130.9375, "completions/mean_terminated_length": 113.0714340209961, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.2039046734571457, "epoch": 0.11768, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4564543679081244e-06, "loss": 0.0, "num_tokens": 61208159.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.23337800800800323, "epoch": 0.11776, "grad_norm": 0.0, "learning_rate": 3.45635167118811e-06, "loss": 0.0, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 159.9765625, "completions/mean_terminated_length": 138.94285583496094, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.22491876780986786, "epoch": 0.11784, "frac_reward_zero_std": 0.75, "grad_norm": 1.1921941041946411, "learning_rate": 3.4562488550415016e-06, "loss": 0.0358, "num_tokens": 61294172.0, "reward": 0.3046875, "reward_std": 0.2218368798494339, "rewards/reward_fn/mean": 0.3046875, "rewards/reward_fn/std": 1.1193262338638306, "step": 1473 }, { "clip_ratio/high_max": 0.011373680550605059, "clip_ratio/high_mean": 0.005090715596452355, "clip_ratio/low_mean": 0.002872619777917862, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007963335607200861, "entropy": 0.26249493658542633, "epoch": 0.11792, "grad_norm": 0.26284101605415344, "learning_rate": 3.4561459194754943e-06, "loss": -0.0107, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 232.59375, "completions/mean_terminated_length": 204.34483337402344, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.30351924896240234, "epoch": 0.118, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.456042864497293e-06, "loss": 0.0, "num_tokens": 61389480.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.31254398822784424, "epoch": 0.11808, "grad_norm": 0.0, "learning_rate": 3.4559396901141106e-06, "loss": 0.0, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6171875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 241.4296875, "completions/mean_terminated_length": 217.9387664794922, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.2601802498102188, "epoch": 0.11816, "frac_reward_zero_std": 0.5, "grad_norm": 0.41120052337646484, "learning_rate": 3.4558363963331676e-06, "loss": 0.0629, "num_tokens": 61485919.0, "reward": 0.140625, "reward_std": 0.5030868649482727, "rewards/reward_fn/mean": 0.140625, "rewards/reward_fn/std": 1.3500328063964844, "step": 1477 }, { "clip_ratio/high_max": 0.02487027458846569, "clip_ratio/high_mean": 0.012687698937952518, "clip_ratio/low_mean": 0.0009584127692505717, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013646111823618412, "entropy": 0.2638261169195175, "epoch": 0.11824, "grad_norm": 0.3604060113430023, "learning_rate": 3.4557329831616945e-06, "loss": -0.0553, "step": 1478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5078125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 241.0, "completions/mean_terminated_length": 225.52381896972656, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.10524195805191994, "epoch": 0.11832, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4556294506069282e-06, "loss": 0.0, "num_tokens": 61582303.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0815538801252842, "epoch": 0.1184, "grad_norm": 0.0, "learning_rate": 3.4555257986761152e-06, "loss": 0.0, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.5546875, "completions/mean_terminated_length": 183.440673828125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.07632943242788315, "epoch": 0.11848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.45542202737651e-06, "loss": 0.0, "num_tokens": 61676326.0, "reward": 0.03641407564282417, "reward_std": 0.0, "rewards/reward_fn/mean": 0.03641407564282417, "rewards/reward_fn/std": 0.09672114253044128, "step": 1481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07142451032996178, "epoch": 0.11856, "grad_norm": 0.0, "learning_rate": 3.4553181367153756e-06, "loss": 0.0, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 236.65625, "completions/mean_terminated_length": 223.8441619873047, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.06698020175099373, "epoch": 0.11864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.455214126699983e-06, "loss": 0.0, "num_tokens": 61772154.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06892156973481178, "epoch": 0.11872, "grad_norm": 0.0, "learning_rate": 3.4551099973376124e-06, "loss": 0.0, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 201.59375, "completions/mean_terminated_length": 190.30189514160156, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.06287827715277672, "epoch": 0.1188, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4550057486355513e-06, "loss": 0.0, "num_tokens": 61863494.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.061454636976122856, "epoch": 0.11888, "grad_norm": 0.0, "learning_rate": 3.4549013806010957e-06, "loss": 0.0, "step": 1486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 207.421875, "completions/mean_terminated_length": 198.42593383789062, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.06273863278329372, "epoch": 0.11896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4547968932415505e-06, "loss": 0.0, "num_tokens": 61955580.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06229061633348465, "epoch": 0.11904, "grad_norm": 0.0, "learning_rate": 3.4546922865642293e-06, "loss": 0.0, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 196.9765625, "completions/mean_terminated_length": 196.9765625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.05673958361148834, "epoch": 0.11912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4545875605764535e-06, "loss": 0.0, "num_tokens": 62046329.0, "reward": 0.7953383922576904, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7953383922576904, "rewards/reward_fn/std": 1.28325617313385, "step": 1489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05652596428990364, "epoch": 0.1192, "grad_norm": 0.0, "learning_rate": 3.454482715285551e-06, "loss": 0.0, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 207.390625, "completions/mean_terminated_length": 201.89564514160156, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.0584364403039217, "epoch": 0.11928, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.454377750698862e-06, "loss": 0.0, "num_tokens": 62138411.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05993151664733887, "epoch": 0.11936, "grad_norm": 0.0, "learning_rate": 3.4542726668237318e-06, "loss": 0.0, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 256.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 186.3046875, "completions/mean_terminated_length": 183.47154235839844, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.05629575252532959, "epoch": 0.11944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4541674636675155e-06, "loss": 0.0, "num_tokens": 62227794.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.059998420998454094, "epoch": 0.11952, "grad_norm": 0.0, "learning_rate": 3.4540621412375765e-06, "loss": 0.0, "step": 1494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 201.234375, "completions/mean_terminated_length": 191.09259033203125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.06218028999865055, "epoch": 0.1196, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4539566995412854e-06, "loss": 0.0, "num_tokens": 62319088.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06220996007323265, "epoch": 0.11968, "grad_norm": 0.0, "learning_rate": 3.4538511385860228e-06, "loss": 0.0, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 256.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 185.984375, "completions/mean_terminated_length": 183.13819885253906, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.05501028709113598, "epoch": 0.11976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.453745458379177e-06, "loss": 0.0, "num_tokens": 62408430.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.058018144220113754, "epoch": 0.11984, "grad_norm": 0.0, "learning_rate": 3.453639658928143e-06, "loss": 0.0, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 170.8515625, "completions/mean_terminated_length": 170.8515625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.05930749699473381, "epoch": 0.11992, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4535337402403274e-06, "loss": 0.0, "num_tokens": 62495835.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06041816063225269, "epoch": 0.12, "grad_norm": 0.0, "learning_rate": 3.4534277023231424e-06, "loss": 0.0, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 256.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 189.8515625, "completions/mean_terminated_length": 189.33070373535156, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.06489469483494759, "epoch": 0.12008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.45332154518401e-06, "loss": 0.0, "num_tokens": 62585672.0, "reward": 0.08830241858959198, "reward_std": 0.0, "rewards/reward_fn/mean": 0.08830241858959198, "rewards/reward_fn/std": 0.23454421758651733, "step": 1501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06256983056664467, "epoch": 0.12016, "grad_norm": 0.0, "learning_rate": 3.4532152688303597e-06, "loss": 0.0, "step": 1502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 199.6796875, "completions/mean_terminated_length": 193.3130340576172, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06682486459612846, "epoch": 0.12024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4531088732696304e-06, "loss": 0.0, "num_tokens": 62676767.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06542772054672241, "epoch": 0.12032, "grad_norm": 0.0, "learning_rate": 3.453002358509268e-06, "loss": 0.0, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 188.59375, "completions/mean_terminated_length": 181.6206817626953, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.06853879615664482, "epoch": 0.1204, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4528957245567274e-06, "loss": 0.0, "num_tokens": 62766443.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06497448682785034, "epoch": 0.12048, "grad_norm": 0.0, "learning_rate": 3.4527889714194724e-06, "loss": 0.0, "step": 1506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 186.4765625, "completions/mean_terminated_length": 184.80801391601562, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.049825992435216904, "epoch": 0.12056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.452682099104974e-06, "loss": 0.0, "num_tokens": 62855848.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05259707011282444, "epoch": 0.12064, "grad_norm": 0.0, "learning_rate": 3.4525751076207123e-06, "loss": 0.0, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 256.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 189.859375, "completions/mean_terminated_length": 184.25424194335938, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.05663658678531647, "epoch": 0.12072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4524679969741763e-06, "loss": 0.0, "num_tokens": 62945686.0, "reward": 0.02706475742161274, "reward_std": 0.0, "rewards/reward_fn/mean": 0.02706475742161274, "rewards/reward_fn/std": 0.07188798487186432, "step": 1509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05753630958497524, "epoch": 0.1208, "grad_norm": 0.0, "learning_rate": 3.4523607671728615e-06, "loss": 0.0, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 198.328125, "completions/mean_terminated_length": 195.98373413085938, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.06257819198071957, "epoch": 0.12088, "frac_reward_zero_std": 0.875, "grad_norm": 0.473859578371048, "learning_rate": 3.452253418224273e-06, "loss": -0.0236, "num_tokens": 63036608.0, "reward": -0.328125, "reward_std": 0.1280868798494339, "rewards/reward_fn/mean": -0.328125, "rewards/reward_fn/std": 0.9400064945220947, "step": 1511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00723253539763391, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00723253539763391, "entropy": 0.061252983286976814, "epoch": 0.12096, "grad_norm": 0.11899473518133163, "learning_rate": 3.4521459501359246e-06, "loss": 0.0242, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 237.8203125, "completions/mean_terminated_length": 229.8539276123047, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.055861422792077065, "epoch": 0.12104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.452038362915338e-06, "loss": 0.0, "num_tokens": 63132585.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.060817936435341835, "epoch": 0.12112, "grad_norm": 0.0, "learning_rate": 3.4519306565700428e-06, "loss": 0.0, "step": 1514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 232.25, "completions/mean_terminated_length": 222.59341430664062, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.05645294301211834, "epoch": 0.1212, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4518228311075775e-06, "loss": 0.0, "num_tokens": 63227849.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.057829972356557846, "epoch": 0.12128, "grad_norm": 0.0, "learning_rate": 3.4517148865354886e-06, "loss": 0.0, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 234.1640625, "completions/mean_terminated_length": 230.12037658691406, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.06194847449660301, "epoch": 0.12136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.451606822861331e-06, "loss": 0.0, "num_tokens": 63323358.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06348313391208649, "epoch": 0.12144, "grad_norm": 0.0, "learning_rate": 3.4514986400926684e-06, "loss": 0.0, "step": 1518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 219.90625, "completions/mean_terminated_length": 217.17648315429688, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.06327307969331741, "epoch": 0.12152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4513903382370722e-06, "loss": 0.0, "num_tokens": 63417042.0, "reward": 0.860668957233429, "reward_std": 0.0, "rewards/reward_fn/mean": 0.860668957233429, "rewards/reward_fn/std": 1.2727471590042114, "step": 1519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06225424259901047, "epoch": 0.1216, "grad_norm": 0.0, "learning_rate": 3.4512819173021225e-06, "loss": 0.0, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 223.921875, "completions/mean_terminated_length": 221.4958038330078, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.06859876215457916, "epoch": 0.12168, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.451173377295407e-06, "loss": 0.0, "num_tokens": 63511240.0, "reward": 0.37749966979026794, "reward_std": 0.0, "rewards/reward_fn/mean": 0.37749966979026794, "rewards/reward_fn/std": 0.9951284527778625, "step": 1521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07041368260979652, "epoch": 0.12176, "grad_norm": 0.0, "learning_rate": 3.451064718224523e-06, "loss": 0.0, "step": 1522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 229.34375, "completions/mean_terminated_length": 217.22727966308594, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.06600522994995117, "epoch": 0.12184, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.450955940097076e-06, "loss": 0.0, "num_tokens": 63606132.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06560098007321358, "epoch": 0.12192, "grad_norm": 0.0, "learning_rate": 3.450847042920678e-06, "loss": 0.0, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 203.6015625, "completions/mean_terminated_length": 198.6752166748047, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.061225639656186104, "epoch": 0.122, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.450738026702952e-06, "loss": 0.0, "num_tokens": 63697729.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.059869520366191864, "epoch": 0.12208, "grad_norm": 0.0, "learning_rate": 3.450628891451527e-06, "loss": 0.0, "step": 1526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 211.7109375, "completions/mean_terminated_length": 201.49038696289062, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06416740082204342, "epoch": 0.12216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4505196371740414e-06, "loss": 0.0, "num_tokens": 63790364.0, "reward": 0.8330045938491821, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8330045938491821, "rewards/reward_fn/std": 1.2743265628814697, "step": 1527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06179231032729149, "epoch": 0.12224, "grad_norm": 0.0, "learning_rate": 3.450410263878142e-06, "loss": 0.0, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1640625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 201.9453125, "completions/mean_terminated_length": 191.33644104003906, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.07168617844581604, "epoch": 0.12232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4503007715714844e-06, "loss": 0.0, "num_tokens": 63881749.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07275807857513428, "epoch": 0.1224, "grad_norm": 0.0, "learning_rate": 3.450191160261731e-06, "loss": 0.0, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.8515625, "completions/mean_terminated_length": 237.9367218017578, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.0667712427675724, "epoch": 0.12248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.450081429956554e-06, "loss": 0.0, "num_tokens": 63978626.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06622674316167831, "epoch": 0.12256, "grad_norm": 0.0, "learning_rate": 3.4499715806636328e-06, "loss": 0.0, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.9609375, "completions/mean_terminated_length": 224.31765747070312, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.07257654890418053, "epoch": 0.12264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.449861612390656e-06, "loss": 0.0, "num_tokens": 64074237.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07146812975406647, "epoch": 0.12272, "grad_norm": 0.0, "learning_rate": 3.449751525145321e-06, "loss": 0.0, "step": 1534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 211.4765625, "completions/mean_terminated_length": 206.44346618652344, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.07633886858820915, "epoch": 0.1228, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.449641318935331e-06, "loss": 0.0, "num_tokens": 64166842.0, "reward": 0.8526300191879272, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8526300191879272, "rewards/reward_fn/std": 1.2727689743041992, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07246232777833939, "epoch": 0.12288, "grad_norm": 0.0, "learning_rate": 3.449530993768401e-06, "loss": 0.0, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.2265625, "completions/mean_terminated_length": 220.72152709960938, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.05860567465424538, "epoch": 0.12296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4494205496522516e-06, "loss": 0.0, "num_tokens": 64262359.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.060271067544817924, "epoch": 0.12304, "grad_norm": 0.0, "learning_rate": 3.449309986594613e-06, "loss": 0.0, "step": 1538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.7890625, "completions/mean_terminated_length": 206.24285888671875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.06779700517654419, "epoch": 0.12312, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4491993046032234e-06, "loss": 0.0, "num_tokens": 64357180.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07134740054607391, "epoch": 0.1232, "grad_norm": 0.0, "learning_rate": 3.44908850368583e-06, "loss": 0.0, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.4765625, "completions/mean_terminated_length": 238.38462829589844, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.06346551328897476, "epoch": 0.12328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4489775838501865e-06, "loss": 0.0, "num_tokens": 64453881.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06426475569605827, "epoch": 0.12336, "grad_norm": 0.0, "learning_rate": 3.4488665451040565e-06, "loss": 0.0, "step": 1542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.3046875, "completions/mean_terminated_length": 240.2911376953125, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.0663563683629036, "epoch": 0.12344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4487553874552124e-06, "loss": 0.0, "num_tokens": 64550944.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06782907247543335, "epoch": 0.12352, "grad_norm": 0.0, "learning_rate": 3.4486441109114333e-06, "loss": 0.0, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5859375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 243.1640625, "completions/mean_terminated_length": 225.0, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.07337246462702751, "epoch": 0.1236, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4485327154805077e-06, "loss": 0.0, "num_tokens": 64647605.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07202625647187233, "epoch": 0.12368, "grad_norm": 0.0, "learning_rate": 3.448421201170231e-06, "loss": 0.0, "step": 1546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 233.09375, "completions/mean_terminated_length": 210.1875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.07387700304389, "epoch": 0.12376, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4483095679884097e-06, "loss": 0.0, "num_tokens": 64742977.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07419203594326973, "epoch": 0.12384, "grad_norm": 0.0, "learning_rate": 3.448197815942856e-06, "loss": 0.0, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 208.7421875, "completions/mean_terminated_length": 183.12046813964844, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.06330282613635063, "epoch": 0.12392, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4480859450413913e-06, "loss": 0.0, "num_tokens": 64835232.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06634881347417831, "epoch": 0.124, "grad_norm": 0.0, "learning_rate": 3.4479739552918464e-06, "loss": 0.0, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1328125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 213.546875, "completions/mean_terminated_length": 207.0450439453125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.07433029264211655, "epoch": 0.12408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4478618467020577e-06, "loss": 0.0, "num_tokens": 64928102.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07159438356757164, "epoch": 0.12416, "grad_norm": 0.0, "learning_rate": 3.447749619279873e-06, "loss": 0.0, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 205.6796875, "completions/mean_terminated_length": 199.5, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.07985683158040047, "epoch": 0.12424, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4476372730331468e-06, "loss": 0.0, "num_tokens": 65019965.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 1553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0778413824737072, "epoch": 0.12432, "grad_norm": 0.0, "learning_rate": 3.4475248079697413e-06, "loss": 0.0, "step": 1554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.984375, "completions/mean_terminated_length": 211.04254150390625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06818721443414688, "epoch": 0.1244, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.447412224097529e-06, "loss": 0.0, "num_tokens": 65114043.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06653439626097679, "epoch": 0.12448, "grad_norm": 0.0, "learning_rate": 3.4472995214243888e-06, "loss": 0.0, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.953125, "completions/mean_terminated_length": 219.78823852539062, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.07165970653295517, "epoch": 0.12456, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.447186699958209e-06, "loss": 0.0, "num_tokens": 65209269.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07377783954143524, "epoch": 0.12464, "grad_norm": 0.0, "learning_rate": 3.447073759706886e-06, "loss": 0.0, "step": 1558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 232.0703125, "completions/mean_terminated_length": 217.2278594970703, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.06952465325593948, "epoch": 0.12472, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4469607006783243e-06, "loss": 0.0, "num_tokens": 65304510.0, "reward": 0.4136883616447449, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4136883616447449, "rewards/reward_fn/std": 0.9866312742233276, "step": 1559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07152506709098816, "epoch": 0.1248, "grad_norm": 0.0, "learning_rate": 3.446847522880437e-06, "loss": 0.0, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.71875, "completions/mean_terminated_length": 213.4091033935547, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.07013571262359619, "epoch": 0.12488, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4467342263211447e-06, "loss": 0.0, "num_tokens": 65399066.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07045596837997437, "epoch": 0.12496, "grad_norm": 0.0, "learning_rate": 3.446620811008378e-06, "loss": 0.0, "step": 1562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 247.5546875, "completions/mean_terminated_length": 234.37998962402344, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.06788037717342377, "epoch": 0.12504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4465072769500737e-06, "loss": 0.0, "num_tokens": 65496289.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07048505917191505, "epoch": 0.12512, "grad_norm": 0.0, "learning_rate": 3.446393624154179e-06, "loss": 0.0, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 221.125, "completions/mean_terminated_length": 212.6602020263672, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.05846274830400944, "epoch": 0.1252, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4462798526286476e-06, "loss": 0.0, "num_tokens": 65590129.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06097453460097313, "epoch": 0.12528, "grad_norm": 0.0, "learning_rate": 3.4461659623814424e-06, "loss": 0.0, "step": 1566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.328125, "completions/mean_terminated_length": 232.2888946533203, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.06719490140676498, "epoch": 0.12536, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4460519534205354e-06, "loss": 0.0, "num_tokens": 65686299.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0664333663880825, "epoch": 0.12544, "grad_norm": 0.0, "learning_rate": 3.445937825753905e-06, "loss": 0.0, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.90625, "completions/mean_terminated_length": 216.13792419433594, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.07072482630610466, "epoch": 0.12552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4458235793895392e-06, "loss": 0.0, "num_tokens": 65781135.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07230367511510849, "epoch": 0.1256, "grad_norm": 0.0, "learning_rate": 3.4457092143354344e-06, "loss": 0.0, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 217.8828125, "completions/mean_terminated_length": 193.44871520996094, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.0650310255587101, "epoch": 0.12568, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4455947305995945e-06, "loss": 0.0, "num_tokens": 65874560.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06671491637825966, "epoch": 0.12576, "grad_norm": 0.0, "learning_rate": 3.445480128190032e-06, "loss": 0.0, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.8046875, "completions/mean_terminated_length": 234.84693908691406, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.06692919135093689, "epoch": 0.12584, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.445365407114769e-06, "loss": 0.0, "num_tokens": 65970791.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06763490289449692, "epoch": 0.12592, "grad_norm": 0.0, "learning_rate": 3.445250567381833e-06, "loss": 0.0, "step": 1574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.4375, "completions/mean_terminated_length": 213.478271484375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.06844192370772362, "epoch": 0.126, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4451356089992632e-06, "loss": 0.0, "num_tokens": 66065183.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06900092214345932, "epoch": 0.12608, "grad_norm": 0.0, "learning_rate": 3.4450205319751044e-06, "loss": 0.0, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.03125, "completions/mean_terminated_length": 218.5066680908203, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.07352141663432121, "epoch": 0.12616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4449053363174113e-06, "loss": 0.0, "num_tokens": 66160675.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07395246624946594, "epoch": 0.12624, "grad_norm": 0.0, "learning_rate": 3.4447900220342464e-06, "loss": 0.0, "step": 1578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1640625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.3515625, "completions/mean_terminated_length": 237.28036499023438, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.06049327552318573, "epoch": 0.12632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.44467458913368e-06, "loss": 0.0, "num_tokens": 66256976.0, "reward": 0.019831063225865364, "reward_std": 0.0, "rewards/reward_fn/mean": 0.019831063225865364, "rewards/reward_fn/std": 0.052674222737550735, "step": 1579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06360585242509842, "epoch": 0.1264, "grad_norm": 0.0, "learning_rate": 3.444559037623792e-06, "loss": 0.0, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 218.921875, "completions/mean_terminated_length": 184.09091186523438, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.07117344811558723, "epoch": 0.12648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4444433675126683e-06, "loss": 0.0, "num_tokens": 66350534.0, "reward": 0.7524996995925903, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7524996995925903, "rewards/reward_fn/std": 1.302709698677063, "step": 1581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07472957298159599, "epoch": 0.12656, "grad_norm": 0.0, "learning_rate": 3.4443275788084066e-06, "loss": 0.0, "step": 1582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.34375, "completions/mean_terminated_length": 236.243896484375, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.06860201805830002, "epoch": 0.12664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4442116715191093e-06, "loss": 0.0, "num_tokens": 66447218.0, "reward": 0.4792068302631378, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4792068302631378, "rewards/reward_fn/std": 0.9949710369110107, "step": 1583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07029742002487183, "epoch": 0.12672, "grad_norm": 0.0, "learning_rate": 3.44409564565289e-06, "loss": 0.0, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.828125, "completions/mean_terminated_length": 234.94595336914062, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.07153256237506866, "epoch": 0.1268, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.443979501217868e-06, "loss": 0.0, "num_tokens": 66543964.0, "reward": 0.03178694099187851, "reward_std": 0.0, "rewards/reward_fn/mean": 0.03178694099187851, "rewards/reward_fn/std": 0.08443079143762589, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07266123220324516, "epoch": 0.12688, "grad_norm": 0.0, "learning_rate": 3.4438632382221725e-06, "loss": 0.0, "step": 1586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 229.078125, "completions/mean_terminated_length": 211.8205108642578, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.07291487604379654, "epoch": 0.12696, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4437468566739412e-06, "loss": 0.0, "num_tokens": 66638822.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07726141065359116, "epoch": 0.12704, "grad_norm": 0.0, "learning_rate": 3.443630356581319e-06, "loss": 0.0, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.734375, "completions/mean_terminated_length": 224.87322998046875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.0674537867307663, "epoch": 0.12712, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4435137379524605e-06, "loss": 0.0, "num_tokens": 66734916.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06993604451417923, "epoch": 0.1272, "grad_norm": 0.0, "learning_rate": 3.4433970007955266e-06, "loss": 0.0, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.78125, "completions/mean_terminated_length": 230.36842346191406, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.0782642625272274, "epoch": 0.12728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4432801451186884e-06, "loss": 0.0, "num_tokens": 66831272.0, "reward": 0.4971176087856293, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4971176087856293, "rewards/reward_fn/std": 0.9702772498130798, "step": 1591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07649056613445282, "epoch": 0.12736, "grad_norm": 0.0, "learning_rate": 3.4431631709301247e-06, "loss": 0.0, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.25, "completions/mean_terminated_length": 241.1428680419922, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.06027054600417614, "epoch": 0.12744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.443046078238022e-06, "loss": 0.0, "num_tokens": 66928328.0, "reward": 0.002499666763469577, "reward_std": 0.0, "rewards/reward_fn/mean": 0.002499666763469577, "rewards/reward_fn/std": 0.006639483384788036, "step": 1593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06315869092941284, "epoch": 0.12752, "grad_norm": 0.0, "learning_rate": 3.4429288670505762e-06, "loss": 0.0, "step": 1594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.484375, "completions/mean_terminated_length": 218.07894897460938, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.06631103903055191, "epoch": 0.1276, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.44281153737599e-06, "loss": 0.0, "num_tokens": 67023750.0, "reward": 0.47843146324157715, "reward_std": 0.0, "rewards/reward_fn/mean": 0.47843146324157715, "rewards/reward_fn/std": 0.9946947693824768, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06907488778233528, "epoch": 0.12768, "grad_norm": 0.0, "learning_rate": 3.4426940892224763e-06, "loss": 0.0, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.65625, "completions/mean_terminated_length": 231.60440063476562, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.07013101503252983, "epoch": 0.12776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4425765225982535e-06, "loss": 0.0, "num_tokens": 67119834.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07116332650184631, "epoch": 0.12784, "grad_norm": 0.0, "learning_rate": 3.442458837511552e-06, "loss": 0.0, "step": 1598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 200.921875, "completions/mean_terminated_length": 188.2115478515625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.07883193343877792, "epoch": 0.12792, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.442341033970607e-06, "loss": 0.0, "num_tokens": 67211088.0, "reward": 1.1636884212493896, "reward_std": 0.0, "rewards/reward_fn/mean": 1.1636884212493896, "rewards/reward_fn/std": 1.4313664436340332, "step": 1599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07907514274120331, "epoch": 0.128, "grad_norm": 0.0, "learning_rate": 3.4422231119836644e-06, "loss": 0.0, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.828125, "completions/mean_terminated_length": 202.65956115722656, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06742135435342789, "epoch": 0.12808, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4421050715589773e-06, "loss": 0.0, "num_tokens": 67304378.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06666234880685806, "epoch": 0.12816, "grad_norm": 0.0, "learning_rate": 3.441986912704807e-06, "loss": 0.0, "step": 1602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.9140625, "completions/mean_terminated_length": 221.5773162841797, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.07505139708518982, "epoch": 0.12824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.441868635429424e-06, "loss": 0.0, "num_tokens": 67399343.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07393203303217888, "epoch": 0.12832, "grad_norm": 0.0, "learning_rate": 3.441750239741106e-06, "loss": 0.0, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 224.5546875, "completions/mean_terminated_length": 215.3434295654297, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.0811232402920723, "epoch": 0.1284, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.441631725648139e-06, "loss": 0.0, "num_tokens": 67493622.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0822780691087246, "epoch": 0.12848, "grad_norm": 0.0, "learning_rate": 3.4415130931588186e-06, "loss": 0.0, "step": 1606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 244.5625, "completions/mean_terminated_length": 235.08570861816406, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.06928539276123047, "epoch": 0.12856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4413943422814472e-06, "loss": 0.0, "num_tokens": 67590462.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06921754032373428, "epoch": 0.12864, "grad_norm": 0.0, "learning_rate": 3.4412754730243364e-06, "loss": 0.0, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.9921875, "completions/mean_terminated_length": 218.21310424804688, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.06662565097212791, "epoch": 0.12872, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4411564853958056e-06, "loss": 0.0, "num_tokens": 67686461.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06861399114131927, "epoch": 0.1288, "grad_norm": 0.0, "learning_rate": 3.441037379404183e-06, "loss": 0.0, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 225.2578125, "completions/mean_terminated_length": 214.13829040527344, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06484746560454369, "epoch": 0.12888, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4409181550578043e-06, "loss": 0.0, "num_tokens": 67780830.0, "reward": 0.3972600996494293, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3972600996494293, "rewards/reward_fn/std": 0.9893408417701721, "step": 1611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06189859285950661, "epoch": 0.12896, "grad_norm": 0.0, "learning_rate": 3.4407988123650145e-06, "loss": 0.0, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 246.5703125, "completions/mean_terminated_length": 241.63095092773438, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.06418554857373238, "epoch": 0.12904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.440679351334166e-06, "loss": 0.0, "num_tokens": 67877927.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06585334613919258, "epoch": 0.12912, "grad_norm": 0.0, "learning_rate": 3.44055977197362e-06, "loss": 0.0, "step": 1614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.0, "completions/mean_terminated_length": 214.32557678222656, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.06332224234938622, "epoch": 0.1292, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4404400742917453e-06, "loss": 0.0, "num_tokens": 67972647.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0638904720544815, "epoch": 0.12928, "grad_norm": 0.0, "learning_rate": 3.44032025829692e-06, "loss": 0.0, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.7734375, "completions/mean_terminated_length": 233.1216278076172, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.0638272576034069, "epoch": 0.12936, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4402003239975296e-06, "loss": 0.0, "num_tokens": 68069258.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06812884658575058, "epoch": 0.12944, "grad_norm": 0.0, "learning_rate": 3.440080271401969e-06, "loss": 0.0, "step": 1618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.875, "completions/mean_terminated_length": 233.74713134765625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.0700785331428051, "epoch": 0.12952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.439960100518639e-06, "loss": 0.0, "num_tokens": 68165626.0, "reward": 0.03178694099187851, "reward_std": 0.0, "rewards/reward_fn/mean": 0.03178694099187851, "rewards/reward_fn/std": 0.08443079143762589, "step": 1619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.069845300167799, "epoch": 0.1296, "grad_norm": 0.0, "learning_rate": 3.4398398113559527e-06, "loss": 0.0, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.7265625, "completions/mean_terminated_length": 220.10842895507812, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.06981916725635529, "epoch": 0.12968, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4397194039223266e-06, "loss": 0.0, "num_tokens": 68260951.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07056078314781189, "epoch": 0.12976, "grad_norm": 0.0, "learning_rate": 3.439598878226189e-06, "loss": 0.0, "step": 1622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 212.4765625, "completions/mean_terminated_length": 186.3625030517578, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.06516049057245255, "epoch": 0.12984, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.439478234275976e-06, "loss": 0.0, "num_tokens": 68353684.0, "reward": 0.4472954273223877, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4472954273223877, "rewards/reward_fn/std": 0.9870926737785339, "step": 1623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06549229100346565, "epoch": 0.12992, "grad_norm": 0.0, "learning_rate": 3.4393574720801305e-06, "loss": 0.0, "step": 1624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.9375, "completions/mean_terminated_length": 233.40660095214844, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.06483650580048561, "epoch": 0.13, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.439236591647105e-06, "loss": 0.0, "num_tokens": 68449932.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06703928858041763, "epoch": 0.13008, "grad_norm": 0.0, "learning_rate": 3.4391155929853603e-06, "loss": 0.0, "step": 1626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.03125, "completions/mean_terminated_length": 207.6216278076172, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.07321789115667343, "epoch": 0.13016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.438994476103364e-06, "loss": 0.0, "num_tokens": 68544656.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07135751098394394, "epoch": 0.13024, "grad_norm": 0.0, "learning_rate": 3.4388732410095935e-06, "loss": 0.0, "step": 1628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.9453125, "completions/mean_terminated_length": 241.32911682128906, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.068378746509552, "epoch": 0.13032, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.438751887712534e-06, "loss": 0.0, "num_tokens": 68641801.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0690118595957756, "epoch": 0.1304, "grad_norm": 0.0, "learning_rate": 3.4386304162206794e-06, "loss": 0.0, "step": 1630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.078125, "completions/mean_terminated_length": 220.2933349609375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.07517102360725403, "epoch": 0.13048, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4385088265425308e-06, "loss": 0.0, "num_tokens": 68737427.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08001108095049858, "epoch": 0.13056, "grad_norm": 0.0, "learning_rate": 3.4383871186865978e-06, "loss": 0.0, "step": 1632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.796875, "completions/mean_terminated_length": 241.59375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.06864126026630402, "epoch": 0.13064, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4382652926614002e-06, "loss": 0.0, "num_tokens": 68834809.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06761198490858078, "epoch": 0.13072, "grad_norm": 0.0, "learning_rate": 3.4381433484754628e-06, "loss": 0.0, "step": 1634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.3359375, "completions/mean_terminated_length": 218.43283081054688, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.06669347733259201, "epoch": 0.1308, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4380212861373217e-06, "loss": 0.0, "num_tokens": 68930596.0, "reward": 0.3874585032463074, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3874585032463074, "rewards/reward_fn/std": 0.9918686747550964, "step": 1635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07034159079194069, "epoch": 0.13088, "grad_norm": 0.0, "learning_rate": 3.437899105655519e-06, "loss": 0.0, "step": 1636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.8515625, "completions/mean_terminated_length": 239.7841033935547, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.06548749655485153, "epoch": 0.13096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4377768070386063e-06, "loss": 0.0, "num_tokens": 69027473.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06537442654371262, "epoch": 0.13104, "grad_norm": 0.0, "learning_rate": 3.4376543902951436e-06, "loss": 0.0, "step": 1638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.3125, "completions/mean_terminated_length": 211.21949768066406, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.07153027877211571, "epoch": 0.13112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4375318554336992e-06, "loss": 0.0, "num_tokens": 69122105.0, "reward": 0.4999999403953552, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4999999403953552, "rewards/reward_fn/std": 1.0039292573928833, "step": 1639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06999137625098228, "epoch": 0.1312, "grad_norm": 0.0, "learning_rate": 3.437409202462848e-06, "loss": 0.0, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 236.6953125, "completions/mean_terminated_length": 224.72152709960938, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.06871918961405754, "epoch": 0.13128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.437286431391175e-06, "loss": 0.0, "num_tokens": 69217938.0, "reward": 0.4020647704601288, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4020647704601288, "rewards/reward_fn/std": 0.9883498549461365, "step": 1641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06782825663685799, "epoch": 0.13136, "grad_norm": 0.0, "learning_rate": 3.4371635422272733e-06, "loss": 0.0, "step": 1642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6015625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.6328125, "completions/mean_terminated_length": 237.5098114013672, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.06825791671872139, "epoch": 0.13144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4370405349797433e-06, "loss": 0.0, "num_tokens": 69315299.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07058708742260933, "epoch": 0.13152, "grad_norm": 0.0, "learning_rate": 3.4369174096571942e-06, "loss": 0.0, "step": 1644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.7890625, "completions/mean_terminated_length": 237.4571533203125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.07234476134181023, "epoch": 0.1316, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.436794166268244e-06, "loss": 0.0, "num_tokens": 69411656.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07563147321343422, "epoch": 0.13168, "grad_norm": 0.0, "learning_rate": 3.436670804821518e-06, "loss": 0.0, "step": 1646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.453125, "completions/mean_terminated_length": 222.63999938964844, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.06675465032458305, "epoch": 0.13176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.43654732532565e-06, "loss": 0.0, "num_tokens": 69507458.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06897066906094551, "epoch": 0.13184, "grad_norm": 0.0, "learning_rate": 3.4364237277892823e-06, "loss": 0.0, "step": 1648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.4453125, "completions/mean_terminated_length": 223.2604217529297, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.0701695941388607, "epoch": 0.13192, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.436300012221066e-06, "loss": 0.0, "num_tokens": 69602619.0, "reward": 0.40443697571754456, "reward_std": 0.0, "rewards/reward_fn/mean": 0.40443697571754456, "rewards/reward_fn/std": 0.9879209995269775, "step": 1649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.071939367800951, "epoch": 0.132, "grad_norm": 0.0, "learning_rate": 3.4361761786296593e-06, "loss": 0.0, "step": 1650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.6953125, "completions/mean_terminated_length": 235.48191833496094, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.07028136774897575, "epoch": 0.13208, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.43605222702373e-06, "loss": 0.0, "num_tokens": 69699220.0, "reward": 0.43078044056892395, "reward_std": 0.0, "rewards/reward_fn/mean": 0.43078044056892395, "rewards/reward_fn/std": 0.9858564734458923, "step": 1651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.072626281529665, "epoch": 0.13216, "grad_norm": 0.0, "learning_rate": 3.4359281574119527e-06, "loss": 0.0, "step": 1652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.8203125, "completions/mean_terminated_length": 215.9339599609375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.06318715214729309, "epoch": 0.13224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.435803969803011e-06, "loss": 0.0, "num_tokens": 69793277.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0638398602604866, "epoch": 0.13232, "grad_norm": 0.0, "learning_rate": 3.435679664205597e-06, "loss": 0.0, "step": 1654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 236.4921875, "completions/mean_terminated_length": 229.71580505371094, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.0689171701669693, "epoch": 0.1324, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4355552406284104e-06, "loss": 0.0, "num_tokens": 69889084.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0728096105158329, "epoch": 0.13248, "grad_norm": 0.0, "learning_rate": 3.43543069908016e-06, "loss": 0.0, "step": 1656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 218.6640625, "completions/mean_terminated_length": 206.21875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.07471872121095657, "epoch": 0.13256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4353060395695623e-06, "loss": 0.0, "num_tokens": 69982609.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07419455796480179, "epoch": 0.13264, "grad_norm": 0.0, "learning_rate": 3.435181262105342e-06, "loss": 0.0, "step": 1658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.1875, "completions/mean_terminated_length": 217.94285583496094, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.080657709389925, "epoch": 0.13272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4350563666962317e-06, "loss": 0.0, "num_tokens": 70078249.0, "reward": 0.08158833533525467, "reward_std": 0.0, "rewards/reward_fn/mean": 0.08158833533525467, "rewards/reward_fn/std": 0.21671062707901, "step": 1659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07076593488454819, "epoch": 0.1328, "grad_norm": 0.0, "learning_rate": 3.434931353350974e-06, "loss": 0.0, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 214.6171875, "completions/mean_terminated_length": 198.42391967773438, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.06390905380249023, "epoch": 0.13288, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.434806222078317e-06, "loss": 0.0, "num_tokens": 70171256.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06407913565635681, "epoch": 0.13296, "grad_norm": 0.0, "learning_rate": 3.43468097288702e-06, "loss": 0.0, "step": 1662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.90625, "completions/mean_terminated_length": 201.2458953857422, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.0673639327287674, "epoch": 0.13304, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4345556057858483e-06, "loss": 0.0, "num_tokens": 70266220.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06854108348488808, "epoch": 0.13312, "grad_norm": 0.0, "learning_rate": 3.4344301207835767e-06, "loss": 0.0, "step": 1664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.0703125, "completions/mean_terminated_length": 212.87838745117188, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.07566960155963898, "epoch": 0.1332, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.434304517888987e-06, "loss": 0.0, "num_tokens": 70361333.0, "reward": 0.3799973428249359, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3799973428249359, "rewards/reward_fn/std": 0.9942457675933838, "step": 1665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07388827204704285, "epoch": 0.13328, "grad_norm": 0.0, "learning_rate": 3.4341787971108713e-06, "loss": 0.0, "step": 1666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.296875, "completions/mean_terminated_length": 220.30589294433594, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.06908483430743217, "epoch": 0.13336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.434052958458028e-06, "loss": 0.0, "num_tokens": 70456603.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07012546062469482, "epoch": 0.13344, "grad_norm": 0.0, "learning_rate": 3.433927001939264e-06, "loss": 0.0, "step": 1668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.6328125, "completions/mean_terminated_length": 211.17283630371094, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.06737405806779861, "epoch": 0.13352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4338009275633965e-06, "loss": 0.0, "num_tokens": 70551276.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06710585206747055, "epoch": 0.1336, "grad_norm": 0.0, "learning_rate": 3.4336747353392475e-06, "loss": 0.0, "step": 1670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.75, "completions/mean_terminated_length": 218.32911682128906, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.06602951139211655, "epoch": 0.13368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.43354842527565e-06, "loss": 0.0, "num_tokens": 70646604.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06813480705022812, "epoch": 0.13376, "grad_norm": 0.0, "learning_rate": 3.433421997381445e-06, "loss": 0.0, "step": 1672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.1796875, "completions/mean_terminated_length": 232.41334533691406, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.07164055854082108, "epoch": 0.13384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4332954516654803e-06, "loss": 0.0, "num_tokens": 70743139.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07224301993846893, "epoch": 0.13392, "grad_norm": 0.0, "learning_rate": 3.4331687881366127e-06, "loss": 0.0, "step": 1674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.3828125, "completions/mean_terminated_length": 239.10227966308594, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.07056576013565063, "epoch": 0.134, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4330420068037073e-06, "loss": 0.0, "num_tokens": 70839956.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07243713736534119, "epoch": 0.13408, "grad_norm": 0.0, "learning_rate": 3.432915107675638e-06, "loss": 0.0, "step": 1676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.078125, "completions/mean_terminated_length": 237.8441619873047, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.07166145369410515, "epoch": 0.13416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4327880907612863e-06, "loss": 0.0, "num_tokens": 70936862.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06719237565994263, "epoch": 0.13424, "grad_norm": 0.0, "learning_rate": 3.4326609560695414e-06, "loss": 0.0, "step": 1678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.21875, "completions/mean_terminated_length": 232.35000610351562, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.0645129643380642, "epoch": 0.13432, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4325337036093027e-06, "loss": 0.0, "num_tokens": 71033274.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06389733776450157, "epoch": 0.1344, "grad_norm": 0.0, "learning_rate": 3.4324063333894746e-06, "loss": 0.0, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.40625, "completions/mean_terminated_length": 205.68182373046875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.06427193991839886, "epoch": 0.13448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.432278845418973e-06, "loss": 0.0, "num_tokens": 71127150.0, "reward": 0.40443697571754456, "reward_std": 0.0, "rewards/reward_fn/mean": 0.40443697571754456, "rewards/reward_fn/std": 0.9879209995269775, "step": 1681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06489191576838493, "epoch": 0.13456, "grad_norm": 0.0, "learning_rate": 3.432151239706721e-06, "loss": 0.0, "step": 1682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.8671875, "completions/mean_terminated_length": 238.35227966308594, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.0797584131360054, "epoch": 0.13464, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4320235162616486e-06, "loss": 0.0, "num_tokens": 71223901.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07592814788222313, "epoch": 0.13472, "grad_norm": 0.0, "learning_rate": 3.4318956750926957e-06, "loss": 0.0, "step": 1684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 226.75, "completions/mean_terminated_length": 213.45455932617188, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.06636564061045647, "epoch": 0.1348, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.43176771620881e-06, "loss": 0.0, "num_tokens": 71318461.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06583115458488464, "epoch": 0.13488, "grad_norm": 0.0, "learning_rate": 3.4316396396189467e-06, "loss": 0.0, "step": 1686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.5234375, "completions/mean_terminated_length": 228.23233032226562, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.06887813657522202, "epoch": 0.13496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.43151144533207e-06, "loss": 0.0, "num_tokens": 71414016.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06911046430468559, "epoch": 0.13504, "grad_norm": 0.0, "learning_rate": 3.4313831333571523e-06, "loss": 0.0, "step": 1688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.3984375, "completions/mean_terminated_length": 206.96609497070312, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.07080447673797607, "epoch": 0.13512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4312547037031743e-06, "loss": 0.0, "num_tokens": 71509427.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07271333038806915, "epoch": 0.1352, "grad_norm": 0.0, "learning_rate": 3.431126156379124e-06, "loss": 0.0, "step": 1690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 192.2421875, "completions/mean_terminated_length": 181.80908203125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.08754273131489754, "epoch": 0.13528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4309974913939992e-06, "loss": 0.0, "num_tokens": 71599570.0, "reward": 0.8097125291824341, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8097125291824341, "rewards/reward_fn/std": 1.278932809829712, "step": 1691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07435600832104683, "epoch": 0.13536, "grad_norm": 0.0, "learning_rate": 3.4308687087568045e-06, "loss": 0.0, "step": 1692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5390625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 248.7578125, "completions/mean_terminated_length": 240.2881317138672, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.06838670000433922, "epoch": 0.13544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4307398084765535e-06, "loss": 0.0, "num_tokens": 71696947.0, "reward": 0.4224936366081238, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4224936366081238, "rewards/reward_fn/std": 0.9859711527824402, "step": 1693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06613919138908386, "epoch": 0.13552, "grad_norm": 0.0, "learning_rate": 3.430610790562268e-06, "loss": 0.0, "step": 1694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.0546875, "completions/mean_terminated_length": 239.12046813964844, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.06555985286831856, "epoch": 0.1356, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4304816550229775e-06, "loss": 0.0, "num_tokens": 71793850.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06587620452046394, "epoch": 0.13568, "grad_norm": 0.0, "learning_rate": 3.4303524018677207e-06, "loss": 0.0, "step": 1696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 249.9375, "completions/mean_terminated_length": 238.3636474609375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.06625334918498993, "epoch": 0.13576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4302230311055437e-06, "loss": 0.0, "num_tokens": 71891378.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06306180357933044, "epoch": 0.13584, "grad_norm": 0.0, "learning_rate": 3.430093542745501e-06, "loss": 0.0, "step": 1698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 236.484375, "completions/mean_terminated_length": 226.9534912109375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.05915193818509579, "epoch": 0.13592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.429963936796655e-06, "loss": 0.0, "num_tokens": 71987184.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06037767417728901, "epoch": 0.136, "grad_norm": 0.0, "learning_rate": 3.4298342132680775e-06, "loss": 0.0, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 238.890625, "completions/mean_terminated_length": 231.6666717529297, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.0702890008687973, "epoch": 0.13608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4297043721688473e-06, "loss": 0.0, "num_tokens": 72083298.0, "reward": 0.384978711605072, "reward_std": 0.0, "rewards/reward_fn/mean": 0.384978711605072, "rewards/reward_fn/std": 0.9926154613494873, "step": 1701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06811826303601265, "epoch": 0.13616, "grad_norm": 0.0, "learning_rate": 3.4295744135080525e-06, "loss": 0.0, "step": 1702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.609375, "completions/mean_terminated_length": 219.30189514160156, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.07356718182563782, "epoch": 0.13624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.429444337294788e-06, "loss": 0.0, "num_tokens": 72177712.0, "reward": 0.45800459384918213, "reward_std": 0.0, "rewards/reward_fn/mean": 0.45800459384918213, "rewards/reward_fn/std": 0.9889340400695801, "step": 1703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07189946249127388, "epoch": 0.13632, "grad_norm": 0.0, "learning_rate": 3.4293141435381587e-06, "loss": 0.0, "step": 1704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.9296875, "completions/mean_terminated_length": 230.3000030517578, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.07160982117056847, "epoch": 0.1364, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.429183832247276e-06, "loss": 0.0, "num_tokens": 72273703.0, "reward": 0.41815176606178284, "reward_std": 0.0, "rewards/reward_fn/mean": 0.41815176606178284, "rewards/reward_fn/std": 0.9862273931503296, "step": 1705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06493500620126724, "epoch": 0.13648, "grad_norm": 0.0, "learning_rate": 3.42905340343126e-06, "loss": 0.0, "step": 1706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.7421875, "completions/mean_terminated_length": 222.61428833007812, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.06835564970970154, "epoch": 0.13656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4289228570992406e-06, "loss": 0.0, "num_tokens": 72369670.0, "reward": 0.03641407564282417, "reward_std": 0.0, "rewards/reward_fn/mean": 0.03641407564282417, "rewards/reward_fn/std": 0.09672114253044128, "step": 1707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06922302022576332, "epoch": 0.13664, "grad_norm": 0.0, "learning_rate": 3.4287921932603534e-06, "loss": 0.0, "step": 1708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 211.8515625, "completions/mean_terminated_length": 202.1809539794922, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.07143346220254898, "epoch": 0.13672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.428661411923745e-06, "loss": 0.0, "num_tokens": 72462323.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07042481377720833, "epoch": 0.1368, "grad_norm": 0.0, "learning_rate": 3.428530513098567e-06, "loss": 0.0, "step": 1710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 209.4296875, "completions/mean_terminated_length": 202.29730224609375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.07035113871097565, "epoch": 0.13688, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4283994967939824e-06, "loss": 0.0, "num_tokens": 72554666.0, "reward": 0.05376052483916283, "reward_std": 0.0, "rewards/reward_fn/mean": 0.05376052483916283, "rewards/reward_fn/std": 0.1427958756685257, "step": 1711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07114414125680923, "epoch": 0.13696, "grad_norm": 0.0, "learning_rate": 3.4282683630191594e-06, "loss": 0.0, "step": 1712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.578125, "completions/mean_terminated_length": 215.4864959716797, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.06828009709715843, "epoch": 0.13704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.428137111783277e-06, "loss": 0.0, "num_tokens": 72649972.0, "reward": 0.3638491630554199, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3638491630554199, "rewards/reward_fn/std": 0.4668463468551636, "step": 1713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06691822782158852, "epoch": 0.13712, "grad_norm": 0.0, "learning_rate": 3.428005743095522e-06, "loss": 0.0, "step": 1714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 214.6796875, "completions/mean_terminated_length": 204.14706420898438, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.0688379816710949, "epoch": 0.1372, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4278742569650878e-06, "loss": 0.0, "num_tokens": 72742987.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07032530754804611, "epoch": 0.13728, "grad_norm": 0.0, "learning_rate": 3.4277426534011767e-06, "loss": 0.0, "step": 1716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5703125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.75, "completions/mean_terminated_length": 239.12725830078125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.07045421376824379, "epoch": 0.13736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4276109324130004e-06, "loss": 0.0, "num_tokens": 72840363.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06858015060424805, "epoch": 0.13744, "grad_norm": 0.0, "learning_rate": 3.427479094009778e-06, "loss": 0.0, "step": 1718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.703125, "completions/mean_terminated_length": 219.4141387939453, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.07189574837684631, "epoch": 0.13752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4273471382007365e-06, "loss": 0.0, "num_tokens": 72935045.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07434603199362755, "epoch": 0.1376, "grad_norm": 0.0, "learning_rate": 3.4272150649951117e-06, "loss": 0.0, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 241.453125, "completions/mean_terminated_length": 233.83334350585938, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.06891127675771713, "epoch": 0.13768, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4270828744021464e-06, "loss": 0.0, "num_tokens": 73031487.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06454328075051308, "epoch": 0.13776, "grad_norm": 0.0, "learning_rate": 3.426950566431094e-06, "loss": 0.0, "step": 1722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.6171875, "completions/mean_terminated_length": 236.6290283203125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.06912524998188019, "epoch": 0.13784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.426818141091213e-06, "loss": 0.0, "num_tokens": 73128590.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0657689981162548, "epoch": 0.13792, "grad_norm": 0.0, "learning_rate": 3.4266855983917737e-06, "loss": 0.0, "step": 1724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.96875, "completions/mean_terminated_length": 222.8235321044922, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.06969829276204109, "epoch": 0.138, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.426552938342051e-06, "loss": 0.0, "num_tokens": 73224074.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06988406926393509, "epoch": 0.13808, "grad_norm": 0.0, "learning_rate": 3.426420160951331e-06, "loss": 0.0, "step": 1726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.6875, "completions/mean_terminated_length": 216.3333282470703, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.06174550577998161, "epoch": 0.13816, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4262872662289055e-06, "loss": 0.0, "num_tokens": 73319522.0, "reward": 0.0688910037279129, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0688910037279129, "rewards/reward_fn/std": 0.18298465013504028, "step": 1727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06332435458898544, "epoch": 0.13824, "grad_norm": 0.0, "learning_rate": 3.4261542541840765e-06, "loss": 0.0, "step": 1728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.359375, "completions/mean_terminated_length": 225.71083068847656, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.07397696375846863, "epoch": 0.13832, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.426021124826153e-06, "loss": 0.0, "num_tokens": 73415312.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07393670827150345, "epoch": 0.1384, "grad_norm": 0.0, "learning_rate": 3.425887878164453e-06, "loss": 0.0, "step": 1730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.40625, "completions/mean_terminated_length": 201.9091033935547, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.06835473328828812, "epoch": 0.13848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4257545142083026e-06, "loss": 0.0, "num_tokens": 73511236.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06584029644727707, "epoch": 0.13856, "grad_norm": 0.0, "learning_rate": 3.4256210329670354e-06, "loss": 0.0, "step": 1732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.90625, "completions/mean_terminated_length": 233.56521606445312, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.07245631143450737, "epoch": 0.13864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4254874344499935e-06, "loss": 0.0, "num_tokens": 73607992.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06903592869639397, "epoch": 0.13872, "grad_norm": 0.0, "learning_rate": 3.425353718666527e-06, "loss": 0.0, "step": 1734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.015625, "completions/mean_terminated_length": 222.34210205078125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.06756703183054924, "epoch": 0.1388, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4252198856259966e-06, "loss": 0.0, "num_tokens": 73703738.0, "reward": 0.0962333157658577, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0962333157658577, "rewards/reward_fn/std": 0.2556098699569702, "step": 1735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06871025636792183, "epoch": 0.13888, "grad_norm": 0.0, "learning_rate": 3.4250859353377675e-06, "loss": 0.0, "step": 1736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.09375, "completions/mean_terminated_length": 217.30337524414062, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.06660888344049454, "epoch": 0.13896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4249518678112147e-06, "loss": 0.0, "num_tokens": 73798598.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0653378926217556, "epoch": 0.13904, "grad_norm": 0.0, "learning_rate": 3.4248176830557226e-06, "loss": 0.0, "step": 1738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.21875, "completions/mean_terminated_length": 238.75, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.06780709698796272, "epoch": 0.13912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.424683381080682e-06, "loss": 0.0, "num_tokens": 73895522.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07189622521400452, "epoch": 0.1392, "grad_norm": 0.0, "learning_rate": 3.424548961895492e-06, "loss": 0.0, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.9921875, "completions/mean_terminated_length": 242.51315307617188, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.0693771056830883, "epoch": 0.13928, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.424414425509562e-06, "loss": 0.0, "num_tokens": 73992801.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07005584239959717, "epoch": 0.13936, "grad_norm": 0.0, "learning_rate": 3.4242797719323075e-06, "loss": 0.0, "step": 1742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.1015625, "completions/mean_terminated_length": 237.0895538330078, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.06793301925063133, "epoch": 0.13944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4241450011731522e-06, "loss": 0.0, "num_tokens": 74089838.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06712685152888298, "epoch": 0.13952, "grad_norm": 0.0, "learning_rate": 3.4240101132415292e-06, "loss": 0.0, "step": 1744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 226.34375, "completions/mean_terminated_length": 216.865966796875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.06716321036219597, "epoch": 0.1396, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4238751081468796e-06, "loss": 0.0, "num_tokens": 74184346.0, "reward": 0.37749966979026794, "reward_std": 0.0, "rewards/reward_fn/mean": 0.37749966979026794, "rewards/reward_fn/std": 0.9951284527778625, "step": 1745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06660891696810722, "epoch": 0.13968, "grad_norm": 0.0, "learning_rate": 3.423739985898651e-06, "loss": 0.0, "step": 1746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 205.3828125, "completions/mean_terminated_length": 187.8000030517578, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06727252155542374, "epoch": 0.13976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4236047465063023e-06, "loss": 0.0, "num_tokens": 74276171.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06460980325937271, "epoch": 0.13984, "grad_norm": 0.0, "learning_rate": 3.4234693899792977e-06, "loss": 0.0, "step": 1748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 217.4140625, "completions/mean_terminated_length": 202.31521606445312, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.07825781404972076, "epoch": 0.13992, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.423333916327111e-06, "loss": 0.0, "num_tokens": 74369536.0, "reward": 0.4472954273223877, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4472954273223877, "rewards/reward_fn/std": 0.9870926737785339, "step": 1749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0748174674808979, "epoch": 0.14, "grad_norm": 0.0, "learning_rate": 3.4231983255592233e-06, "loss": 0.0, "step": 1750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1640625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 210.2265625, "completions/mean_terminated_length": 201.24298095703125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 0.07419058308005333, "epoch": 0.14008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4230626176851257e-06, "loss": 0.0, "num_tokens": 74461981.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 1751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07255548983812332, "epoch": 0.14016, "grad_norm": 0.0, "learning_rate": 3.4229267927143157e-06, "loss": 0.0, "step": 1752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.0234375, "completions/mean_terminated_length": 207.86021423339844, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.06637454032897949, "epoch": 0.14024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4227908506563e-06, "loss": 0.0, "num_tokens": 74555808.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06596282124519348, "epoch": 0.14032, "grad_norm": 0.0, "learning_rate": 3.4226547915205922e-06, "loss": 0.0, "step": 1754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.46875, "completions/mean_terminated_length": 241.6119384765625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.07435205578804016, "epoch": 0.1404, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4225186153167155e-06, "loss": 0.0, "num_tokens": 74653148.0, "reward": 0.41815176606178284, "reward_std": 0.0, "rewards/reward_fn/mean": 0.41815176606178284, "rewards/reward_fn/std": 0.9862273335456848, "step": 1755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07131052389740944, "epoch": 0.14048, "grad_norm": 0.0, "learning_rate": 3.4223823220542013e-06, "loss": 0.0, "step": 1756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.3828125, "completions/mean_terminated_length": 231.26744079589844, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.07207894325256348, "epoch": 0.14056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.422245911742588e-06, "loss": 0.0, "num_tokens": 74749325.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07412123680114746, "epoch": 0.14064, "grad_norm": 0.0, "learning_rate": 3.4221093843914232e-06, "loss": 0.0, "step": 1758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1640625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 215.4765625, "completions/mean_terminated_length": 207.5233612060547, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.06459341756999493, "epoch": 0.14072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4219727400102625e-06, "loss": 0.0, "num_tokens": 74842442.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06594053655862808, "epoch": 0.1408, "grad_norm": 0.0, "learning_rate": 3.4218359786086693e-06, "loss": 0.0, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.0234375, "completions/mean_terminated_length": 239.53225708007812, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.06803019717335701, "epoch": 0.14088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.421699100196215e-06, "loss": 0.0, "num_tokens": 74939725.0, "reward": 0.10495676845312119, "reward_std": 0.0, "rewards/reward_fn/mean": 0.10495676845312119, "rewards/reward_fn/std": 0.27878063917160034, "step": 1761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06906696408987045, "epoch": 0.14096, "grad_norm": 0.0, "learning_rate": 3.421562104782481e-06, "loss": 0.0, "step": 1762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.984375, "completions/mean_terminated_length": 223.60919189453125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.0780847854912281, "epoch": 0.14104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.421424992377054e-06, "loss": 0.0, "num_tokens": 75035211.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07409156113862991, "epoch": 0.14112, "grad_norm": 0.0, "learning_rate": 3.421287762989531e-06, "loss": 0.0, "step": 1764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.2421875, "completions/mean_terminated_length": 236.4545440673828, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.06552881747484207, "epoch": 0.1412, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4211504166295175e-06, "loss": 0.0, "num_tokens": 75132010.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0648694857954979, "epoch": 0.14128, "grad_norm": 0.0, "learning_rate": 3.4210129533066253e-06, "loss": 0.0, "step": 1766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.265625, "completions/mean_terminated_length": 235.49253845214844, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.0723014660179615, "epoch": 0.14136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4208753730304755e-06, "loss": 0.0, "num_tokens": 75228940.0, "reward": 0.03178694099187851, "reward_std": 0.0, "rewards/reward_fn/mean": 0.03178694099187851, "rewards/reward_fn/std": 0.08443079143762589, "step": 1767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07045934349298477, "epoch": 0.14144, "grad_norm": 0.0, "learning_rate": 3.4207376758106975e-06, "loss": 0.0, "step": 1768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 216.59375, "completions/mean_terminated_length": 194.48779296875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.06877072155475616, "epoch": 0.14152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.420599861656928e-06, "loss": 0.0, "num_tokens": 75322200.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06988411769270897, "epoch": 0.1416, "grad_norm": 0.0, "learning_rate": 3.4204619305788135e-06, "loss": 0.0, "step": 1770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 218.453125, "completions/mean_terminated_length": 205.9375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.07087964564561844, "epoch": 0.14168, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4203238825860067e-06, "loss": 0.0, "num_tokens": 75415698.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 1771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06836184114217758, "epoch": 0.14176, "grad_norm": 0.0, "learning_rate": 3.4201857176881707e-06, "loss": 0.0, "step": 1772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 224.875, "completions/mean_terminated_length": 206.1999969482422, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.07032632827758789, "epoch": 0.14184, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4200474358949747e-06, "loss": 0.0, "num_tokens": 75510018.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06956849992275238, "epoch": 0.14192, "grad_norm": 0.0, "learning_rate": 3.4199090372160973e-06, "loss": 0.0, "step": 1774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 239.5625, "completions/mean_terminated_length": 225.5072479248047, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.07094819098711014, "epoch": 0.142, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.419770521661225e-06, "loss": 0.0, "num_tokens": 75606218.0, "reward": 0.4998975992202759, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4998975992202759, "rewards/reward_fn/std": 1.0038779973983765, "step": 1775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.069453664124012, "epoch": 0.14208, "grad_norm": 0.0, "learning_rate": 3.419631889240052e-06, "loss": 0.0, "step": 1776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.90625, "completions/mean_terminated_length": 206.58460998535156, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.06235708296298981, "epoch": 0.14216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4194931399622813e-06, "loss": 0.0, "num_tokens": 75701310.0, "reward": 0.02467191591858864, "reward_std": 0.0, "rewards/reward_fn/mean": 0.02467191591858864, "rewards/reward_fn/std": 0.06553223729133606, "step": 1777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06353932246565819, "epoch": 0.14224, "grad_norm": 0.0, "learning_rate": 3.4193542738376244e-06, "loss": 0.0, "step": 1778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5546875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.3515625, "completions/mean_terminated_length": 234.3333282470703, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.06990615278482437, "epoch": 0.14232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4192152908758003e-06, "loss": 0.0, "num_tokens": 75798379.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06834063678979874, "epoch": 0.1424, "grad_norm": 0.0, "learning_rate": 3.419076191086535e-06, "loss": 0.0, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.328125, "completions/mean_terminated_length": 206.67532348632812, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.06850584223866463, "epoch": 0.14248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.418936974479566e-06, "loss": 0.0, "num_tokens": 75892885.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07183603197336197, "epoch": 0.14256, "grad_norm": 0.0, "learning_rate": 3.4187976410646362e-06, "loss": 0.0, "step": 1782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1328125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 210.296875, "completions/mean_terminated_length": 203.29730224609375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.07312837988138199, "epoch": 0.14264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4186581908514965e-06, "loss": 0.0, "num_tokens": 75985339.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 1783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07660682499408722, "epoch": 0.14272, "grad_norm": 0.0, "learning_rate": 3.4185186238499085e-06, "loss": 0.0, "step": 1784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.78125, "completions/mean_terminated_length": 234.90321350097656, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.06698597222566605, "epoch": 0.1428, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.41837894006964e-06, "loss": 0.0, "num_tokens": 76082335.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06652442365884781, "epoch": 0.14288, "grad_norm": 0.0, "learning_rate": 3.418239139520466e-06, "loss": 0.0, "step": 1786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.984375, "completions/mean_terminated_length": 210.06521606445312, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.0671536810696125, "epoch": 0.14296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4180992222121736e-06, "loss": 0.0, "num_tokens": 76176413.0, "reward": 0.7524996995925903, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7524996995925903, "rewards/reward_fn/std": 1.302709698677063, "step": 1787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06775333359837532, "epoch": 0.14304, "grad_norm": 0.0, "learning_rate": 3.417959188154553e-06, "loss": 0.0, "step": 1788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.015625, "completions/mean_terminated_length": 229.6288604736328, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.06278564780950546, "epoch": 0.14312, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4178190373574064e-06, "loss": 0.0, "num_tokens": 76272159.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06327282637357712, "epoch": 0.1432, "grad_norm": 0.0, "learning_rate": 3.4176787698305428e-06, "loss": 0.0, "step": 1790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 239.671875, "completions/mean_terminated_length": 232.25, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.06649286299943924, "epoch": 0.14328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.417538385583779e-06, "loss": 0.0, "num_tokens": 76368373.0, "reward": 0.05828011780977249, "reward_std": 0.0, "rewards/reward_fn/mean": 0.05828011780977249, "rewards/reward_fn/std": 0.1473591923713684, "step": 1791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06615026295185089, "epoch": 0.14336, "grad_norm": 0.0, "learning_rate": 3.4173978846269412e-06, "loss": 0.0, "step": 1792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.3359375, "completions/mean_terminated_length": 236.3552703857422, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.07734404504299164, "epoch": 0.14344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4172572669698625e-06, "loss": 0.0, "num_tokens": 76465184.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07218518853187561, "epoch": 0.14352, "grad_norm": 0.0, "learning_rate": 3.4171165326223843e-06, "loss": 0.0, "step": 1794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.9296875, "completions/mean_terminated_length": 207.91346740722656, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06594178825616837, "epoch": 0.1436, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4169756815943573e-06, "loss": 0.0, "num_tokens": 76558487.0, "reward": 0.85834801197052, "reward_std": 0.0, "rewards/reward_fn/mean": 0.85834801197052, "rewards/reward_fn/std": 1.272716760635376, "step": 1795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06829003244638443, "epoch": 0.14368, "grad_norm": 0.0, "learning_rate": 3.416834713895639e-06, "loss": 0.0, "step": 1796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 223.6640625, "completions/mean_terminated_length": 214.19192504882812, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.06635947152972221, "epoch": 0.14376, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.416693629536096e-06, "loss": 0.0, "num_tokens": 76652652.0, "reward": 0.12486984580755234, "reward_std": 0.0, "rewards/reward_fn/mean": 0.12486984580755234, "rewards/reward_fn/std": 0.33167269825935364, "step": 1797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06366812065243721, "epoch": 0.14384, "grad_norm": 0.0, "learning_rate": 3.4165524285256024e-06, "loss": 0.0, "step": 1798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.9296875, "completions/mean_terminated_length": 232.50537109375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.07496332004666328, "epoch": 0.14392, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.416411110874041e-06, "loss": 0.0, "num_tokens": 76748771.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07763902842998505, "epoch": 0.144, "grad_norm": 0.0, "learning_rate": 3.4162696765913026e-06, "loss": 0.0, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.703125, "completions/mean_terminated_length": 225.435302734375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.07523301243782043, "epoch": 0.14408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4161281256872856e-06, "loss": 0.0, "num_tokens": 76844477.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0719352662563324, "epoch": 0.14416, "grad_norm": 0.0, "learning_rate": 3.415986458171898e-06, "loss": 0.0, "step": 1802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 228.25, "completions/mean_terminated_length": 211.03797912597656, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.07407417893409729, "epoch": 0.14424, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4158446740550538e-06, "loss": 0.0, "num_tokens": 76939229.0, "reward": 0.3923865556716919, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3923865556716919, "rewards/reward_fn/std": 0.9905130863189697, "step": 1803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07737919688224792, "epoch": 0.14432, "grad_norm": 0.0, "learning_rate": 3.4157027733466774e-06, "loss": 0.0, "step": 1804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 206.0703125, "completions/mean_terminated_length": 187.27957153320312, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.05941596440970898, "epoch": 0.1444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4155607560567e-06, "loss": 0.0, "num_tokens": 77031142.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05841292627155781, "epoch": 0.14448, "grad_norm": 0.0, "learning_rate": 3.415418622195061e-06, "loss": 0.0, "step": 1806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 207.921875, "completions/mean_terminated_length": 199.01852416992188, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.05998528003692627, "epoch": 0.14456, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4152763717717094e-06, "loss": 0.0, "num_tokens": 77123292.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.060562120750546455, "epoch": 0.14464, "grad_norm": 0.0, "learning_rate": 3.4151340047965996e-06, "loss": 0.0, "step": 1808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.0625, "completions/mean_terminated_length": 237.79591369628906, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.06818773597478867, "epoch": 0.14472, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.414991521279697e-06, "loss": 0.0, "num_tokens": 77219812.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06523017585277557, "epoch": 0.1448, "grad_norm": 0.0, "learning_rate": 3.4148489212309734e-06, "loss": 0.0, "step": 1810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.7109375, "completions/mean_terminated_length": 236.74737548828125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.06481752172112465, "epoch": 0.14488, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4147062046604093e-06, "loss": 0.0, "num_tokens": 77316287.0, "reward": 0.12489760667085648, "reward_std": 0.0, "rewards/reward_fn/mean": 0.12489760667085648, "rewards/reward_fn/std": 0.3317464292049408, "step": 1811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06590625643730164, "epoch": 0.14496, "grad_norm": 0.0, "learning_rate": 3.414563371577994e-06, "loss": 0.0, "step": 1812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.9375, "completions/mean_terminated_length": 241.46478271484375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.0643000639975071, "epoch": 0.14504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.414420421993723e-06, "loss": 0.0, "num_tokens": 77413559.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06411885470151901, "epoch": 0.14512, "grad_norm": 0.0, "learning_rate": 3.414277355917603e-06, "loss": 0.0, "step": 1814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.8828125, "completions/mean_terminated_length": 195.67469787597656, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.07075734063982964, "epoch": 0.1452, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4141341733596454e-06, "loss": 0.0, "num_tokens": 77506856.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06801024824380875, "epoch": 0.14528, "grad_norm": 0.0, "learning_rate": 3.4139908743298727e-06, "loss": 0.0, "step": 1816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5546875, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 225.765625, "completions/mean_terminated_length": 188.1052703857422, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.07359801232814789, "epoch": 0.14536, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.413847458838314e-06, "loss": 0.0, "num_tokens": 77601290.0, "reward": 0.019831063225865364, "reward_std": 0.0, "rewards/reward_fn/mean": 0.019831063225865364, "rewards/reward_fn/std": 0.052674222737550735, "step": 1817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07241496071219444, "epoch": 0.14544, "grad_norm": 0.0, "learning_rate": 3.4137039268950064e-06, "loss": 0.0, "step": 1818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.1328125, "completions/mean_terminated_length": 233.0319061279297, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.06456141546368599, "epoch": 0.14552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.413560278509996e-06, "loss": 0.0, "num_tokens": 77697435.0, "reward": 0.012458499521017075, "reward_std": 0.0, "rewards/reward_fn/mean": 0.012458499521017075, "rewards/reward_fn/std": 0.03309160843491554, "step": 1819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06379830837249756, "epoch": 0.1456, "grad_norm": 0.0, "learning_rate": 3.4134165136933366e-06, "loss": 0.0, "step": 1820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.21875, "completions/mean_terminated_length": 233.09091186523438, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.06845008581876755, "epoch": 0.14568, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4132726324550903e-06, "loss": 0.0, "num_tokens": 77793975.0, "reward": 0.1965917944908142, "reward_std": 0.0, "rewards/reward_fn/mean": 0.1965917944908142, "rewards/reward_fn/std": 0.34731626510620117, "step": 1821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06909046322107315, "epoch": 0.14576, "grad_norm": 0.0, "learning_rate": 3.4131286348053275e-06, "loss": 0.0, "step": 1822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 211.3203125, "completions/mean_terminated_length": 184.5124969482422, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.06790284439921379, "epoch": 0.14584, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4129845207541252e-06, "loss": 0.0, "num_tokens": 77886560.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07006208226084709, "epoch": 0.14592, "grad_norm": 0.0, "learning_rate": 3.4128402903115718e-06, "loss": 0.0, "step": 1824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 241.8203125, "completions/mean_terminated_length": 235.13792419433594, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.06279975548386574, "epoch": 0.146, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4126959434877607e-06, "loss": 0.0, "num_tokens": 77983049.0, "reward": 0.2068842053413391, "reward_std": 0.0, "rewards/reward_fn/mean": 0.2068842053413391, "rewards/reward_fn/std": 0.36670902371406555, "step": 1825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06615272909402847, "epoch": 0.14608, "grad_norm": 0.0, "learning_rate": 3.412551480292795e-06, "loss": 0.0, "step": 1826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.640625, "completions/mean_terminated_length": 216.25, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.0710468590259552, "epoch": 0.14616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4124069007367854e-06, "loss": 0.0, "num_tokens": 78078491.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07191426306962967, "epoch": 0.14624, "grad_norm": 0.0, "learning_rate": 3.4122622048298513e-06, "loss": 0.0, "step": 1828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 222.8671875, "completions/mean_terminated_length": 211.8229217529297, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.07504767924547195, "epoch": 0.14632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.41211739258212e-06, "loss": 0.0, "num_tokens": 78172554.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07260722666978836, "epoch": 0.1464, "grad_norm": 0.0, "learning_rate": 3.4119724640037263e-06, "loss": 0.0, "step": 1830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5078125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 249.1328125, "completions/mean_terminated_length": 242.04763793945312, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.07441423460841179, "epoch": 0.14648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4118274191048133e-06, "loss": 0.0, "num_tokens": 78269979.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07225159555673599, "epoch": 0.14656, "grad_norm": 0.0, "learning_rate": 3.411682257895534e-06, "loss": 0.0, "step": 1832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 208.0546875, "completions/mean_terminated_length": 199.69723510742188, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.059621887281537056, "epoch": 0.14664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4115369803860465e-06, "loss": 0.0, "num_tokens": 78362146.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06030483916401863, "epoch": 0.14672, "grad_norm": 0.0, "learning_rate": 3.41139158658652e-06, "loss": 0.0, "step": 1834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.9375, "completions/mean_terminated_length": 210.42105102539062, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.08056149631738663, "epoch": 0.1468, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.41124607650713e-06, "loss": 0.0, "num_tokens": 78456986.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 1835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07472280785441399, "epoch": 0.14688, "grad_norm": 0.0, "learning_rate": 3.4111004501580606e-06, "loss": 0.0, "step": 1836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 242.5, "completions/mean_terminated_length": 234.1265869140625, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.0647394172847271, "epoch": 0.14696, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4109547075495043e-06, "loss": 0.0, "num_tokens": 78553562.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0653931237757206, "epoch": 0.14704, "grad_norm": 0.0, "learning_rate": 3.410808848691661e-06, "loss": 0.0, "step": 1838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.5078125, "completions/mean_terminated_length": 220.53260803222656, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.06953935325145721, "epoch": 0.14712, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4106628735947405e-06, "loss": 0.0, "num_tokens": 78648603.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07023197412490845, "epoch": 0.1472, "grad_norm": 0.0, "learning_rate": 3.4105167822689587e-06, "loss": 0.0, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.7890625, "completions/mean_terminated_length": 232.5421600341797, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.06668455898761749, "epoch": 0.14728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4103705747245403e-06, "loss": 0.0, "num_tokens": 78744960.0, "reward": 0.12281021475791931, "reward_std": 0.0, "rewards/reward_fn/mean": 0.12281021475791931, "rewards/reward_fn/std": 0.31130945682525635, "step": 1841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06792976334691048, "epoch": 0.14736, "grad_norm": 0.0, "learning_rate": 3.4102242509717184e-06, "loss": 0.0, "step": 1842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.0234375, "completions/mean_terminated_length": 240.53030395507812, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.06738186255097389, "epoch": 0.14744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4100778110207344e-06, "loss": 0.0, "num_tokens": 78842243.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.068658497184515, "epoch": 0.14752, "grad_norm": 0.0, "learning_rate": 3.4099312548818374e-06, "loss": 0.0, "step": 1844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.3359375, "completions/mean_terminated_length": 217.25509643554688, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.07556789368391037, "epoch": 0.1476, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.409784582565284e-06, "loss": 0.0, "num_tokens": 78936750.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07171772420406342, "epoch": 0.14768, "grad_norm": 0.0, "learning_rate": 3.4096377940813414e-06, "loss": 0.0, "step": 1846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.8203125, "completions/mean_terminated_length": 212.44595336914062, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.07357122004032135, "epoch": 0.14776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.409490889440282e-06, "loss": 0.0, "num_tokens": 79031831.0, "reward": 0.4020647704601288, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4020647704601288, "rewards/reward_fn/std": 0.9883498549461365, "step": 1847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07403568178415298, "epoch": 0.14784, "grad_norm": 0.0, "learning_rate": 3.4093438686523878e-06, "loss": 0.0, "step": 1848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 250.078125, "completions/mean_terminated_length": 243.36668395996094, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.06499994173645973, "epoch": 0.14792, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4091967317279486e-06, "loss": 0.0, "num_tokens": 79129377.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0658382698893547, "epoch": 0.148, "grad_norm": 0.0, "learning_rate": 3.409049478677263e-06, "loss": 0.0, "step": 1850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 227.75, "completions/mean_terminated_length": 210.2278594970703, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.06895875185728073, "epoch": 0.14808, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.408902109510637e-06, "loss": 0.0, "num_tokens": 79224065.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0704876147210598, "epoch": 0.14816, "grad_norm": 0.0, "learning_rate": 3.4087546242383843e-06, "loss": 0.0, "step": 1852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.90625, "completions/mean_terminated_length": 241.11474609375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.08040731400251389, "epoch": 0.14824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.408607022870827e-06, "loss": 0.0, "num_tokens": 79321461.0, "reward": 0.002499666763469577, "reward_std": 0.0, "rewards/reward_fn/mean": 0.002499666763469577, "rewards/reward_fn/std": 0.006639483384788036, "step": 1853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07640871405601501, "epoch": 0.14832, "grad_norm": 0.0, "learning_rate": 3.408459305418297e-06, "loss": 0.0, "step": 1854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 243.2578125, "completions/mean_terminated_length": 232.36231994628906, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.06459390744566917, "epoch": 0.1484, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4083114718911323e-06, "loss": 0.0, "num_tokens": 79418134.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06466831639409065, "epoch": 0.14848, "grad_norm": 0.0, "learning_rate": 3.4081635222996797e-06, "loss": 0.0, "step": 1856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.578125, "completions/mean_terminated_length": 234.8000030517578, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.07376763969659805, "epoch": 0.14856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4080154566542935e-06, "loss": 0.0, "num_tokens": 79514848.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07531635090708733, "epoch": 0.14864, "grad_norm": 0.0, "learning_rate": 3.4078672749653377e-06, "loss": 0.0, "step": 1858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 237.453125, "completions/mean_terminated_length": 231.7755126953125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.06536349281668663, "epoch": 0.14872, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4077189772431833e-06, "loss": 0.0, "num_tokens": 79610778.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.063922930508852, "epoch": 0.1488, "grad_norm": 0.0, "learning_rate": 3.407570563498209e-06, "loss": 0.0, "step": 1860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.71875, "completions/mean_terminated_length": 228.8333282470703, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.06813589856028557, "epoch": 0.14888, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4074220337408027e-06, "loss": 0.0, "num_tokens": 79707126.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06831956282258034, "epoch": 0.14896, "grad_norm": 0.0, "learning_rate": 3.4072733879813593e-06, "loss": 0.0, "step": 1862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 239.6796875, "completions/mean_terminated_length": 227.38356018066406, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.06832100078463554, "epoch": 0.14904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4071246262302834e-06, "loss": 0.0, "num_tokens": 79803341.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06885804235935211, "epoch": 0.14912, "grad_norm": 0.0, "learning_rate": 3.4069757484979858e-06, "loss": 0.0, "step": 1864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.984375, "completions/mean_terminated_length": 239.77215576171875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.07070551812648773, "epoch": 0.1492, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4068267547948866e-06, "loss": 0.0, "num_tokens": 79900363.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07216610386967659, "epoch": 0.14928, "grad_norm": 0.0, "learning_rate": 3.4066776451314142e-06, "loss": 0.0, "step": 1866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 220.0390625, "completions/mean_terminated_length": 212.57546997070312, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.06720560044050217, "epoch": 0.14936, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4065284195180045e-06, "loss": 0.0, "num_tokens": 79994064.0, "reward": 0.38992840051651, "reward_std": 0.0, "rewards/reward_fn/mean": 0.38992840051651, "rewards/reward_fn/std": 0.9911679625511169, "step": 1867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07080528512597084, "epoch": 0.14944, "grad_norm": 0.0, "learning_rate": 3.406379077965102e-06, "loss": 0.0, "step": 1868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 219.6796875, "completions/mean_terminated_length": 201.30589294433594, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.06996375322341919, "epoch": 0.14952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.406229620483158e-06, "loss": 0.0, "num_tokens": 80087719.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07176362723112106, "epoch": 0.1496, "grad_norm": 0.0, "learning_rate": 3.4060800470826344e-06, "loss": 0.0, "step": 1870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.15625, "completions/mean_terminated_length": 231.46267700195312, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.07188083976507187, "epoch": 0.14968, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4059303577739986e-06, "loss": 0.0, "num_tokens": 80184379.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07181189581751823, "epoch": 0.14976, "grad_norm": 0.0, "learning_rate": 3.4057805525677275e-06, "loss": 0.0, "step": 1872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 204.109375, "completions/mean_terminated_length": 192.13462829589844, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.07049412280321121, "epoch": 0.14984, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4056306314743066e-06, "loss": 0.0, "num_tokens": 80276041.0, "reward": 1.1544369459152222, "reward_std": 0.0, "rewards/reward_fn/mean": 1.1544369459152222, "rewards/reward_fn/std": 1.437130093574524, "step": 1873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0717724971473217, "epoch": 0.14992, "grad_norm": 0.0, "learning_rate": 3.405480594504228e-06, "loss": 0.0, "step": 1874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.015625, "completions/mean_terminated_length": 240.02500915527344, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 0.06742884963750839, "epoch": 0.15, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.405330441667993e-06, "loss": 0.0, "num_tokens": 80373067.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06506720557808876, "epoch": 0.15008, "grad_norm": 0.0, "learning_rate": 3.4051801729761103e-06, "loss": 0.0, "step": 1876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 219.3828125, "completions/mean_terminated_length": 210.49514770507812, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.06804550439119339, "epoch": 0.15016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.405029788439098e-06, "loss": 0.0, "num_tokens": 80466684.0, "reward": 0.7673865556716919, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7673865556716919, "rewards/reward_fn/std": 1.2948493957519531, "step": 1877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07032667845487595, "epoch": 0.15024, "grad_norm": 0.0, "learning_rate": 3.404879288067481e-06, "loss": 0.0, "step": 1878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.65625, "completions/mean_terminated_length": 216.13186645507812, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.07829012349247932, "epoch": 0.15032, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4047286718717923e-06, "loss": 0.0, "num_tokens": 80561360.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07862139120697975, "epoch": 0.1504, "grad_norm": 0.0, "learning_rate": 3.404577939862574e-06, "loss": 0.0, "step": 1880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.1171875, "completions/mean_terminated_length": 241.37681579589844, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.07338596507906914, "epoch": 0.15048, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.404427092050375e-06, "loss": 0.0, "num_tokens": 80658655.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07269137352705002, "epoch": 0.15056, "grad_norm": 0.0, "learning_rate": 3.4042761284457544e-06, "loss": 0.0, "step": 1882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5078125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.71875, "completions/mean_terminated_length": 235.1111297607422, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.06981110200285912, "epoch": 0.15064, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4041250490592773e-06, "loss": 0.0, "num_tokens": 80755643.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07146303355693817, "epoch": 0.15072, "grad_norm": 0.0, "learning_rate": 3.4039738539015173e-06, "loss": 0.0, "step": 1884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 234.3984375, "completions/mean_terminated_length": 228.34999084472656, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.06542781740427017, "epoch": 0.1508, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4038225429830564e-06, "loss": 0.0, "num_tokens": 80851182.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06937597319483757, "epoch": 0.15088, "grad_norm": 0.0, "learning_rate": 3.4036711163144855e-06, "loss": 0.0, "step": 1886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.9609375, "completions/mean_terminated_length": 218.9666748046875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.06779516488313675, "epoch": 0.15096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.403519573906403e-06, "loss": 0.0, "num_tokens": 80946153.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06865144148468971, "epoch": 0.15104, "grad_norm": 0.0, "learning_rate": 3.4033679157694143e-06, "loss": 0.0, "step": 1888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 229.1484375, "completions/mean_terminated_length": 211.36363220214844, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.07023539394140244, "epoch": 0.15112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.403216141914134e-06, "loss": 0.0, "num_tokens": 81041020.0, "reward": 0.37749966979026794, "reward_std": 0.0, "rewards/reward_fn/mean": 0.37749966979026794, "rewards/reward_fn/std": 0.9951284527778625, "step": 1889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07143033295869827, "epoch": 0.1512, "grad_norm": 0.0, "learning_rate": 3.403064252351186e-06, "loss": 0.0, "step": 1890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.578125, "completions/mean_terminated_length": 234.54348754882812, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.06860479339957237, "epoch": 0.15128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.402912247091199e-06, "loss": 0.0, "num_tokens": 81137350.0, "reward": 0.10180176049470901, "reward_std": 0.0, "rewards/reward_fn/mean": 0.10180176049470901, "rewards/reward_fn/std": 0.2704004645347595, "step": 1891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06814040243625641, "epoch": 0.15136, "grad_norm": 0.0, "learning_rate": 3.4027601261448132e-06, "loss": 0.0, "step": 1892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 217.2890625, "completions/mean_terminated_length": 187.18055725097656, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.06334582716226578, "epoch": 0.15144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.402607889522675e-06, "loss": 0.0, "num_tokens": 81230699.0, "reward": 0.7524996995925903, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7524996995925903, "rewards/reward_fn/std": 1.302709698677063, "step": 1893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06180976890027523, "epoch": 0.15152, "grad_norm": 0.0, "learning_rate": 3.40245553723544e-06, "loss": 0.0, "step": 1894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 243.9453125, "completions/mean_terminated_length": 232.9701385498047, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.06709641218185425, "epoch": 0.1516, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4023030692937697e-06, "loss": 0.0, "num_tokens": 81327460.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06793367862701416, "epoch": 0.15168, "grad_norm": 0.0, "learning_rate": 3.4021504857083365e-06, "loss": 0.0, "step": 1896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6328125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 238.234375, "completions/mean_terminated_length": 207.6170196533203, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.07452854514122009, "epoch": 0.15176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4019977864898193e-06, "loss": 0.0, "num_tokens": 81423490.0, "reward": 0.581389307975769, "reward_std": 0.0, "rewards/reward_fn/mean": 0.581389307975769, "rewards/reward_fn/std": 0.9841175675392151, "step": 1897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07527808845043182, "epoch": 0.15184, "grad_norm": 0.0, "learning_rate": 3.4018449716489055e-06, "loss": 0.0, "step": 1898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 242.71875, "completions/mean_terminated_length": 237.11111450195312, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.06326575204730034, "epoch": 0.15192, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4016920411962904e-06, "loss": 0.0, "num_tokens": 81520094.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06384564936161041, "epoch": 0.152, "grad_norm": 0.0, "learning_rate": 3.4015389951426773e-06, "loss": 0.0, "step": 1900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.5703125, "completions/mean_terminated_length": 236.01162719726562, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.07068263366818428, "epoch": 0.15208, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4013858334987782e-06, "loss": 0.0, "num_tokens": 81616679.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06794620677828789, "epoch": 0.15216, "grad_norm": 0.0, "learning_rate": 3.401232556275313e-06, "loss": 0.0, "step": 1902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.7734375, "completions/mean_terminated_length": 228.30667114257812, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.07631449028849602, "epoch": 0.15224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.401079163483009e-06, "loss": 0.0, "num_tokens": 81712906.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07699376717209816, "epoch": 0.15232, "grad_norm": 0.0, "learning_rate": 3.4009256551326025e-06, "loss": 0.0, "step": 1904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 247.2890625, "completions/mean_terminated_length": 238.578125, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.06844865530729294, "epoch": 0.1524, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.400772031234837e-06, "loss": 0.0, "num_tokens": 81810095.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.065402552485466, "epoch": 0.15248, "grad_norm": 0.0, "learning_rate": 3.400618291800465e-06, "loss": 0.0, "step": 1906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.8671875, "completions/mean_terminated_length": 232.31631469726562, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.06652498245239258, "epoch": 0.15256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4004644368402457e-06, "loss": 0.0, "num_tokens": 81906078.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07051102072000504, "epoch": 0.15264, "grad_norm": 0.0, "learning_rate": 3.4003104663649483e-06, "loss": 0.0, "step": 1908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.953125, "completions/mean_terminated_length": 241.0465087890625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.06978559494018555, "epoch": 0.15272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.4001563803853494e-06, "loss": 0.0, "num_tokens": 82003096.0, "reward": 0.10094611346721649, "reward_std": 0.0, "rewards/reward_fn/mean": 0.10094611346721649, "rewards/reward_fn/std": 0.2681277394294739, "step": 1909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06891292333602905, "epoch": 0.1528, "grad_norm": 0.0, "learning_rate": 3.4000021789122326e-06, "loss": 0.0, "step": 1910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.828125, "completions/mean_terminated_length": 203.7391357421875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.07656038552522659, "epoch": 0.15288, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.39984786195639e-06, "loss": 0.0, "num_tokens": 82097794.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07879335805773735, "epoch": 0.15296, "grad_norm": 0.0, "learning_rate": 3.3996934295286235e-06, "loss": 0.0, "step": 1912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.0078125, "completions/mean_terminated_length": 217.05435180664062, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.06367248296737671, "epoch": 0.15304, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.39953888163974e-06, "loss": 0.0, "num_tokens": 82192515.0, "reward": 0.12084341049194336, "reward_std": 0.0, "rewards/reward_fn/mean": 0.12084341049194336, "rewards/reward_fn/std": 0.3209778666496277, "step": 1913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.062253063544631004, "epoch": 0.15312, "grad_norm": 0.0, "learning_rate": 3.399384218300558e-06, "loss": 0.0, "step": 1914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 223.296875, "completions/mean_terminated_length": 210.5, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.06893860548734665, "epoch": 0.1532, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3992294395219016e-06, "loss": 0.0, "num_tokens": 82286633.0, "reward": 0.48566895723342896, "reward_std": 0.0, "rewards/reward_fn/mean": 0.48566895723342896, "rewards/reward_fn/std": 0.9974362850189209, "step": 1915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0664789080619812, "epoch": 0.15328, "grad_norm": 0.0, "learning_rate": 3.3990745453146034e-06, "loss": 0.0, "step": 1916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.9765625, "completions/mean_terminated_length": 242.1692352294922, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.07358858734369278, "epoch": 0.15336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3989195356895043e-06, "loss": 0.0, "num_tokens": 82384038.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07216830924153328, "epoch": 0.15344, "grad_norm": 0.0, "learning_rate": 3.3987644106574537e-06, "loss": 0.0, "step": 1918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.6171875, "completions/mean_terminated_length": 240.60256958007812, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.06473383679986, "epoch": 0.15352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3986091702293087e-06, "loss": 0.0, "num_tokens": 82481141.0, "reward": 0.07061244547367096, "reward_std": 0.0, "rewards/reward_fn/mean": 0.07061244547367096, "rewards/reward_fn/std": 0.18755705654621124, "step": 1919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06360270082950592, "epoch": 0.1536, "grad_norm": 0.0, "learning_rate": 3.3984538144159344e-06, "loss": 0.0, "step": 1920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 239.4609375, "completions/mean_terminated_length": 225.75714111328125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.07012388855218887, "epoch": 0.15368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.398298343228204e-06, "loss": 0.0, "num_tokens": 82577328.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06650030985474586, "epoch": 0.15376, "grad_norm": 0.0, "learning_rate": 3.398142756676999e-06, "loss": 0.0, "step": 1922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 234.53125, "completions/mean_terminated_length": 230.55555725097656, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.07089593261480331, "epoch": 0.15384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3979870547732083e-06, "loss": 0.0, "num_tokens": 82672884.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06941617652773857, "epoch": 0.15392, "grad_norm": 0.0, "learning_rate": 3.3978312375277303e-06, "loss": 0.0, "step": 1924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 224.78125, "completions/mean_terminated_length": 214.375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.06612656638026237, "epoch": 0.154, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.39767530495147e-06, "loss": 0.0, "num_tokens": 82767192.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06448361277580261, "epoch": 0.15408, "grad_norm": 0.0, "learning_rate": 3.397519257055341e-06, "loss": 0.0, "step": 1926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.0, "completions/mean_terminated_length": 215.57894897460938, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.07289231196045876, "epoch": 0.15416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.397363093850265e-06, "loss": 0.0, "num_tokens": 82862424.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0730183757841587, "epoch": 0.15424, "grad_norm": 0.0, "learning_rate": 3.397206815347172e-06, "loss": 0.0, "step": 1928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.3984375, "completions/mean_terminated_length": 219.13333129882812, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.0654946118593216, "epoch": 0.15432, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3970504215569996e-06, "loss": 0.0, "num_tokens": 82957963.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0643225759267807, "epoch": 0.1544, "grad_norm": 0.0, "learning_rate": 3.3968939124906938e-06, "loss": 0.0, "step": 1930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.7109375, "completions/mean_terminated_length": 235.21591186523438, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.06150699220597744, "epoch": 0.15448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.396737288159209e-06, "loss": 0.0, "num_tokens": 83054438.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06394447013735771, "epoch": 0.15456, "grad_norm": 0.0, "learning_rate": 3.3965805485735065e-06, "loss": 0.0, "step": 1932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.4375, "completions/mean_terminated_length": 226.14083862304688, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.06909829378128052, "epoch": 0.15464, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3964236937445564e-06, "loss": 0.0, "num_tokens": 83150622.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06956316158175468, "epoch": 0.15472, "grad_norm": 0.0, "learning_rate": 3.396266723683338e-06, "loss": 0.0, "step": 1934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.2265625, "completions/mean_terminated_length": 206.01515197753906, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.0677366815507412, "epoch": 0.1548, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3961096384008363e-06, "loss": 0.0, "num_tokens": 83245627.0, "reward": 0.41141408681869507, "reward_std": 0.0, "rewards/reward_fn/mean": 0.41141408681869507, "rewards/reward_fn/std": 0.9868917465209961, "step": 1935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06665485724806786, "epoch": 0.15488, "grad_norm": 0.0, "learning_rate": 3.3959524379080463e-06, "loss": 0.0, "step": 1936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 247.8828125, "completions/mean_terminated_length": 239.765625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.06821661069989204, "epoch": 0.15496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.39579512221597e-06, "loss": 0.0, "num_tokens": 83342892.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0691719576716423, "epoch": 0.15504, "grad_norm": 0.0, "learning_rate": 3.3956376913356184e-06, "loss": 0.0, "step": 1938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 206.8671875, "completions/mean_terminated_length": 192.47474670410156, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.07000164687633514, "epoch": 0.15512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3954801452780096e-06, "loss": 0.0, "num_tokens": 83434907.0, "reward": 0.7722600698471069, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7722600698471069, "rewards/reward_fn/std": 1.292528510093689, "step": 1939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07585688680410385, "epoch": 0.1552, "grad_norm": 0.0, "learning_rate": 3.39532248405417e-06, "loss": 0.0, "step": 1940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.5546875, "completions/mean_terminated_length": 230.85858154296875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.07732851058244705, "epoch": 0.15528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.395164707675134e-06, "loss": 0.0, "num_tokens": 83530722.0, "reward": 0.7770648002624512, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7770648002624512, "rewards/reward_fn/std": 1.2903637886047363, "step": 1941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07801739498972893, "epoch": 0.15536, "grad_norm": 0.0, "learning_rate": 3.3950068161519457e-06, "loss": 0.0, "step": 1942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 222.2109375, "completions/mean_terminated_length": 203.25608825683594, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.07425682991743088, "epoch": 0.15544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.394848809495654e-06, "loss": 0.0, "num_tokens": 83624701.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07127947360277176, "epoch": 0.15552, "grad_norm": 0.0, "learning_rate": 3.394690687717319e-06, "loss": 0.0, "step": 1944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 239.953125, "completions/mean_terminated_length": 230.3249969482422, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.07372711598873138, "epoch": 0.1556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.394532450828007e-06, "loss": 0.0, "num_tokens": 83720951.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0745374783873558, "epoch": 0.15568, "grad_norm": 0.0, "learning_rate": 3.3943740988387925e-06, "loss": 0.0, "step": 1946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 245.0859375, "completions/mean_terminated_length": 238.08975219726562, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.069059107452631, "epoch": 0.15576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3942156317607594e-06, "loss": 0.0, "num_tokens": 83817858.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07174122706055641, "epoch": 0.15584, "grad_norm": 0.0, "learning_rate": 3.394057049604998e-06, "loss": 0.0, "step": 1948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.859375, "completions/mean_terminated_length": 236.10989379882812, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.06678295135498047, "epoch": 0.15592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.393898352382608e-06, "loss": 0.0, "num_tokens": 83914352.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06707125902175903, "epoch": 0.156, "grad_norm": 0.0, "learning_rate": 3.393739540104696e-06, "loss": 0.0, "step": 1950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 215.2109375, "completions/mean_terminated_length": 193.84524536132812, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.0631227120757103, "epoch": 0.15608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.393580612782377e-06, "loss": 0.0, "num_tokens": 84007435.0, "reward": 0.09413323551416397, "reward_std": 0.0, "rewards/reward_fn/mean": 0.09413323551416397, "rewards/reward_fn/std": 0.2500317096710205, "step": 1951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0627526305615902, "epoch": 0.15616, "grad_norm": 0.0, "learning_rate": 3.3934215704267755e-06, "loss": 0.0, "step": 1952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.2109375, "completions/mean_terminated_length": 201.02468872070312, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.06771104410290718, "epoch": 0.15624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3932624130490213e-06, "loss": 0.0, "num_tokens": 84101286.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07157555595040321, "epoch": 0.15632, "grad_norm": 0.0, "learning_rate": 3.3931031406602543e-06, "loss": 0.0, "step": 1954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.5390625, "completions/mean_terminated_length": 236.89999389648438, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.07341345399618149, "epoch": 0.1564, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3929437532716222e-06, "loss": 0.0, "num_tokens": 84198635.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07318595051765442, "epoch": 0.15648, "grad_norm": 0.0, "learning_rate": 3.39278425089428e-06, "loss": 0.0, "step": 1956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.7890625, "completions/mean_terminated_length": 238.3854217529297, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.07127195596694946, "epoch": 0.15656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3926246335393904e-06, "loss": 0.0, "num_tokens": 84295248.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.072492565959692, "epoch": 0.15664, "grad_norm": 0.0, "learning_rate": 3.3924649012181273e-06, "loss": 0.0, "step": 1958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 241.125, "completions/mean_terminated_length": 230.27027893066406, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.07181078568100929, "epoch": 0.15672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.392305053941668e-06, "loss": 0.0, "num_tokens": 84391648.0, "reward": 0.06713119894266129, "reward_std": 0.0, "rewards/reward_fn/mean": 0.06713119894266129, "rewards/reward_fn/std": 0.1783103495836258, "step": 1959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07294033095240593, "epoch": 0.1568, "grad_norm": 0.0, "learning_rate": 3.392145091721201e-06, "loss": 0.0, "step": 1960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 227.171875, "completions/mean_terminated_length": 216.7446746826172, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.06789962574839592, "epoch": 0.15688, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3919850145679224e-06, "loss": 0.0, "num_tokens": 84486262.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06670743227005005, "epoch": 0.15696, "grad_norm": 0.0, "learning_rate": 3.3918248224930345e-06, "loss": 0.0, "step": 1962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 217.2734375, "completions/mean_terminated_length": 206.42999267578125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.06596580892801285, "epoch": 0.15704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3916645155077507e-06, "loss": 0.0, "num_tokens": 84579609.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06156945414841175, "epoch": 0.15712, "grad_norm": 0.0, "learning_rate": 3.39150409362329e-06, "loss": 0.0, "step": 1964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.96875, "completions/mean_terminated_length": 206.5, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.06686685234308243, "epoch": 0.1572, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3913435568508807e-06, "loss": 0.0, "num_tokens": 84673557.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06786331906914711, "epoch": 0.15728, "grad_norm": 0.0, "learning_rate": 3.3911829052017573e-06, "loss": 0.0, "step": 1966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 223.1640625, "completions/mean_terminated_length": 209.3000030517578, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.08443482220172882, "epoch": 0.15736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3910221386871655e-06, "loss": 0.0, "num_tokens": 84767658.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07725803181529045, "epoch": 0.15744, "grad_norm": 0.0, "learning_rate": 3.390861257318356e-06, "loss": 0.0, "step": 1968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.078125, "completions/mean_terminated_length": 228.78651428222656, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.07102623954415321, "epoch": 0.15752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3907002611065893e-06, "loss": 0.0, "num_tokens": 84863540.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.074496790766716, "epoch": 0.1576, "grad_norm": 0.0, "learning_rate": 3.3905391500631335e-06, "loss": 0.0, "step": 1970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.328125, "completions/mean_terminated_length": 216.0253143310547, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.07510823011398315, "epoch": 0.15768, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3903779241992646e-06, "loss": 0.0, "num_tokens": 84958686.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07031358405947685, "epoch": 0.15776, "grad_norm": 0.0, "learning_rate": 3.3902165835262665e-06, "loss": 0.0, "step": 1972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 243.3203125, "completions/mean_terminated_length": 238.1648406982422, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.06416650488972664, "epoch": 0.15784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.390055128055432e-06, "loss": 0.0, "num_tokens": 85055367.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06749238073825836, "epoch": 0.15792, "grad_norm": 0.0, "learning_rate": 3.3898935577980606e-06, "loss": 0.0, "step": 1974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 250.265625, "completions/mean_terminated_length": 243.559326171875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.06604081019759178, "epoch": 0.158, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.38973187276546e-06, "loss": 0.0, "num_tokens": 85152937.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0707143247127533, "epoch": 0.15808, "grad_norm": 0.0, "learning_rate": 3.3895700729689477e-06, "loss": 0.0, "step": 1976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.2265625, "completions/mean_terminated_length": 225.43678283691406, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.06616894528269768, "epoch": 0.15816, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.389408158419848e-06, "loss": 0.0, "num_tokens": 85248582.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06605223566293716, "epoch": 0.15824, "grad_norm": 0.0, "learning_rate": 3.389246129129492e-06, "loss": 0.0, "step": 1978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.328125, "completions/mean_terminated_length": 212.84091186523438, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.07171432673931122, "epoch": 0.15832, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3890839851092208e-06, "loss": 0.0, "num_tokens": 85343088.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07577696442604065, "epoch": 0.1584, "grad_norm": 0.0, "learning_rate": 3.3889217263703826e-06, "loss": 0.0, "step": 1980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.0859375, "completions/mean_terminated_length": 215.77174377441406, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06539416313171387, "epoch": 0.15848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3887593529243336e-06, "loss": 0.0, "num_tokens": 85437691.0, "reward": 0.41815176606178284, "reward_std": 0.0, "rewards/reward_fn/mean": 0.41815176606178284, "rewards/reward_fn/std": 0.9862273335456848, "step": 1981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06739253550767899, "epoch": 0.15856, "grad_norm": 0.0, "learning_rate": 3.3885968647824383e-06, "loss": 0.0, "step": 1982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.21875, "completions/mean_terminated_length": 236.69387817382812, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.0666097104549408, "epoch": 0.15864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.38843426195607e-06, "loss": 0.0, "num_tokens": 85534103.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0691882073879242, "epoch": 0.15872, "grad_norm": 0.0, "learning_rate": 3.388271544456608e-06, "loss": 0.0, "step": 1984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.6796875, "completions/mean_terminated_length": 235.1704559326172, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.06416351720690727, "epoch": 0.1588, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3881087122954413e-06, "loss": 0.0, "num_tokens": 85630574.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06552626192569733, "epoch": 0.15888, "grad_norm": 0.0, "learning_rate": 3.3879457654839668e-06, "loss": 0.0, "step": 1986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5390625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 235.40625, "completions/mean_terminated_length": 211.32203674316406, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.06838884577155113, "epoch": 0.15896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3877827040335886e-06, "loss": 0.0, "num_tokens": 85726242.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0692628026008606, "epoch": 0.15904, "grad_norm": 0.0, "learning_rate": 3.3876195279557197e-06, "loss": 0.0, "step": 1988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 217.015625, "completions/mean_terminated_length": 206.09999084472656, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.07897326722741127, "epoch": 0.15912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.38745623726178e-06, "loss": 0.0, "num_tokens": 85819556.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07687900587916374, "epoch": 0.1592, "grad_norm": 0.0, "learning_rate": 3.3872928319631985e-06, "loss": 0.0, "step": 1990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.265625, "completions/mean_terminated_length": 214.47760009765625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.07306177914142609, "epoch": 0.15928, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.387129312071412e-06, "loss": 0.0, "num_tokens": 85915078.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0786157064139843, "epoch": 0.15936, "grad_norm": 0.0, "learning_rate": 3.3869656775978645e-06, "loss": 0.0, "step": 1992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.8671875, "completions/mean_terminated_length": 212.03797912597656, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.07912987470626831, "epoch": 0.15944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3868019285540092e-06, "loss": 0.0, "num_tokens": 86009909.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 1993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0782947689294815, "epoch": 0.15952, "grad_norm": 0.0, "learning_rate": 3.3866380649513073e-06, "loss": 0.0, "step": 1994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.1640625, "completions/mean_terminated_length": 218.76416015625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06053239107131958, "epoch": 0.1596, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.386474086801227e-06, "loss": 0.0, "num_tokens": 86104266.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06444455683231354, "epoch": 0.15968, "grad_norm": 0.0, "learning_rate": 3.3863099941152445e-06, "loss": 0.0, "step": 1996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.1328125, "completions/mean_terminated_length": 217.8125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06811094656586647, "epoch": 0.15976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.386145786904846e-06, "loss": 0.0, "num_tokens": 86199515.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 1997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07151955366134644, "epoch": 0.15984, "grad_norm": 0.0, "learning_rate": 3.385981465181523e-06, "loss": 0.0, "step": 1998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 235.5546875, "completions/mean_terminated_length": 215.109375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.07065301388502121, "epoch": 0.15992, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.385817028956776e-06, "loss": 0.0, "num_tokens": 86295202.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 1999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06743162870407104, "epoch": 0.16, "grad_norm": 0.0, "learning_rate": 3.3856524782421153e-06, "loss": 0.0, "step": 2000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.875, "completions/mean_terminated_length": 237.74647521972656, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.07303757220506668, "epoch": 0.16008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3854878130490565e-06, "loss": 0.0, "num_tokens": 86392210.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07092196494340897, "epoch": 0.16016, "grad_norm": 0.0, "learning_rate": 3.385323033389125e-06, "loss": 0.0, "step": 2002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.90625, "completions/mean_terminated_length": 234.08509826660156, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.07299964129924774, "epoch": 0.16024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.385158139273853e-06, "loss": 0.0, "num_tokens": 86488454.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07100170850753784, "epoch": 0.16032, "grad_norm": 0.0, "learning_rate": 3.3849931307147824e-06, "loss": 0.0, "step": 2004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 237.65625, "completions/mean_terminated_length": 227.01234436035156, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.06628578528761864, "epoch": 0.1604, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3848280077234612e-06, "loss": 0.0, "num_tokens": 86584410.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06513234600424767, "epoch": 0.16048, "grad_norm": 0.0, "learning_rate": 3.3846627703114466e-06, "loss": 0.0, "step": 2006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 223.40625, "completions/mean_terminated_length": 215.09805297851562, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.06856600567698479, "epoch": 0.16056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3844974184903026e-06, "loss": 0.0, "num_tokens": 86678542.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06827784329652786, "epoch": 0.16064, "grad_norm": 0.0, "learning_rate": 3.384331952271604e-06, "loss": 0.0, "step": 2008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.5390625, "completions/mean_terminated_length": 237.2353057861328, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.06769579648971558, "epoch": 0.16072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.38416637166693e-06, "loss": 0.0, "num_tokens": 86775251.0, "reward": 0.002499666763469577, "reward_std": 0.0, "rewards/reward_fn/mean": 0.002499666763469577, "rewards/reward_fn/std": 0.006639483384788036, "step": 2009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06972775980830193, "epoch": 0.1608, "grad_norm": 0.0, "learning_rate": 3.38400067668787e-06, "loss": 0.0, "step": 2010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.1171875, "completions/mean_terminated_length": 235.04122924804688, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.07786223292350769, "epoch": 0.16088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.383834867346021e-06, "loss": 0.0, "num_tokens": 86871522.0, "reward": 0.11487460136413574, "reward_std": 0.0, "rewards/reward_fn/mean": 0.11487460136413574, "rewards/reward_fn/std": 0.2104565054178238, "step": 2011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07896436005830765, "epoch": 0.16096, "grad_norm": 0.0, "learning_rate": 3.3836689436529883e-06, "loss": 0.0, "step": 2012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.0390625, "completions/mean_terminated_length": 207.6707305908203, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.06695502251386642, "epoch": 0.16104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3835029056203836e-06, "loss": 0.0, "num_tokens": 86965863.0, "reward": 0.0074910130351781845, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0074910130351781845, "rewards/reward_fn/std": 0.019897233694791794, "step": 2013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06443862989544868, "epoch": 0.16112, "grad_norm": 0.0, "learning_rate": 3.3833367532598287e-06, "loss": 0.0, "step": 2014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 228.2421875, "completions/mean_terminated_length": 206.6527862548828, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.0733562707901001, "epoch": 0.1612, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3831704865829526e-06, "loss": 0.0, "num_tokens": 87060614.0, "reward": 0.02706475742161274, "reward_std": 0.0, "rewards/reward_fn/mean": 0.02706475742161274, "rewards/reward_fn/std": 0.07188798487186432, "step": 2015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0744607225060463, "epoch": 0.16128, "grad_norm": 0.0, "learning_rate": 3.383004105601392e-06, "loss": 0.0, "step": 2016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5078125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.4296875, "completions/mean_terminated_length": 238.58731079101562, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.06828498840332031, "epoch": 0.16136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3828376103267915e-06, "loss": 0.0, "num_tokens": 87157821.0, "reward": 0.3874585032463074, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3874585032463074, "rewards/reward_fn/std": 0.9918686747550964, "step": 2017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06613628193736076, "epoch": 0.16144, "grad_norm": 0.0, "learning_rate": 3.3826710007708047e-06, "loss": 0.0, "step": 2018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.5234375, "completions/mean_terminated_length": 216.38043212890625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.07487946003675461, "epoch": 0.16152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3825042769450923e-06, "loss": 0.0, "num_tokens": 87252480.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06819252669811249, "epoch": 0.1616, "grad_norm": 0.0, "learning_rate": 3.3823374388613227e-06, "loss": 0.0, "step": 2020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.484375, "completions/mean_terminated_length": 212.11236572265625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.06820625811815262, "epoch": 0.16168, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3821704865311725e-06, "loss": 0.0, "num_tokens": 87346878.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06631472334265709, "epoch": 0.16176, "grad_norm": 0.0, "learning_rate": 3.3820034199663284e-06, "loss": 0.0, "step": 2022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 207.203125, "completions/mean_terminated_length": 195.94232177734375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.07024501264095306, "epoch": 0.16184, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.381836239178481e-06, "loss": 0.0, "num_tokens": 87438936.0, "reward": 0.4472954273223877, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4472954273223877, "rewards/reward_fn/std": 0.9870926737785339, "step": 2023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06565533950924873, "epoch": 0.16192, "grad_norm": 0.0, "learning_rate": 3.381668944179333e-06, "loss": 0.0, "step": 2024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 213.09375, "completions/mean_terminated_length": 200.52525329589844, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.06757788360118866, "epoch": 0.162, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3815015349805927e-06, "loss": 0.0, "num_tokens": 87531748.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0685306079685688, "epoch": 0.16208, "grad_norm": 0.0, "learning_rate": 3.3813340115939763e-06, "loss": 0.0, "step": 2026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.34375, "completions/mean_terminated_length": 237.8235321044922, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.06743402034044266, "epoch": 0.16216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3811663740312095e-06, "loss": 0.0, "num_tokens": 87628816.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06882448121905327, "epoch": 0.16224, "grad_norm": 0.0, "learning_rate": 3.3809986223040253e-06, "loss": 0.0, "step": 2028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.4765625, "completions/mean_terminated_length": 223.2386474609375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.07470296695828438, "epoch": 0.16232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3808307564241637e-06, "loss": 0.0, "num_tokens": 87724237.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.073318712413311, "epoch": 0.1624, "grad_norm": 0.0, "learning_rate": 3.3806627764033738e-06, "loss": 0.0, "step": 2030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 240.8671875, "completions/mean_terminated_length": 228.3285675048828, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.06526872515678406, "epoch": 0.16248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.380494682253413e-06, "loss": 0.0, "num_tokens": 87820604.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06459398567676544, "epoch": 0.16256, "grad_norm": 0.0, "learning_rate": 3.3803264739860453e-06, "loss": 0.0, "step": 2032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 232.8515625, "completions/mean_terminated_length": 223.07778930664062, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.07533745840191841, "epoch": 0.16264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.380158151613044e-06, "loss": 0.0, "num_tokens": 87915945.0, "reward": 0.9481821060180664, "reward_std": 0.0, "rewards/reward_fn/mean": 0.9481821060180664, "rewards/reward_fn/std": 1.2328767776489258, "step": 2033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07636276260018349, "epoch": 0.16272, "grad_norm": 0.0, "learning_rate": 3.3799897151461897e-06, "loss": 0.0, "step": 2034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.4453125, "completions/mean_terminated_length": 222.0865478515625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.07054631412029266, "epoch": 0.1628, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3798211645972717e-06, "loss": 0.0, "num_tokens": 88010722.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07291663065552711, "epoch": 0.16288, "grad_norm": 0.0, "learning_rate": 3.3796524999780865e-06, "loss": 0.0, "step": 2036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.796875, "completions/mean_terminated_length": 240.24176025390625, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.0667511448264122, "epoch": 0.16296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3794837213004385e-06, "loss": 0.0, "num_tokens": 88107592.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06131202541291714, "epoch": 0.16304, "grad_norm": 0.0, "learning_rate": 3.3793148285761404e-06, "loss": 0.0, "step": 2038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.046875, "completions/mean_terminated_length": 214.39535522460938, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.0717732347548008, "epoch": 0.16312, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3791458218170133e-06, "loss": 0.0, "num_tokens": 88202318.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06984762847423553, "epoch": 0.1632, "grad_norm": 0.0, "learning_rate": 3.3789767010348853e-06, "loss": 0.0, "step": 2040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.734375, "completions/mean_terminated_length": 222.0, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.06764241307973862, "epoch": 0.16328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.378807466241594e-06, "loss": 0.0, "num_tokens": 88298412.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 1.3280736207962036, "step": 2041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06774694845080376, "epoch": 0.16336, "grad_norm": 0.0, "learning_rate": 3.3786381174489836e-06, "loss": 0.0, "step": 2042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.515625, "completions/mean_terminated_length": 239.84616088867188, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.0673811174929142, "epoch": 0.16344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.378468654668906e-06, "loss": 0.0, "num_tokens": 88395246.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06879408285021782, "epoch": 0.16352, "grad_norm": 0.0, "learning_rate": 3.3782990779132233e-06, "loss": 0.0, "step": 2044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.1484375, "completions/mean_terminated_length": 221.66217041015625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.06636777147650719, "epoch": 0.1636, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3781293871938023e-06, "loss": 0.0, "num_tokens": 88491009.0, "reward": 0.3799973428249359, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3799973428249359, "rewards/reward_fn/std": 0.9942457675933838, "step": 2045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06890271231532097, "epoch": 0.16368, "grad_norm": 0.0, "learning_rate": 3.3779595825225213e-06, "loss": 0.0, "step": 2046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.7578125, "completions/mean_terminated_length": 239.4050750732422, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.0729549415409565, "epoch": 0.16376, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3777896639112634e-06, "loss": 0.0, "num_tokens": 88588002.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07050331309437752, "epoch": 0.16384, "grad_norm": 0.0, "learning_rate": 3.3776196313719222e-06, "loss": 0.0, "step": 2048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.1015625, "completions/mean_terminated_length": 217.46875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.07397502660751343, "epoch": 0.16392, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3774494849163973e-06, "loss": 0.0, "num_tokens": 88682607.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07123211026191711, "epoch": 0.164, "grad_norm": 0.0, "learning_rate": 3.377279224556598e-06, "loss": 0.0, "step": 2050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 219.9609375, "completions/mean_terminated_length": 209.40403747558594, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.07286249473690987, "epoch": 0.16408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3771088503044397e-06, "loss": 0.0, "num_tokens": 88776298.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07526427507400513, "epoch": 0.16416, "grad_norm": 0.0, "learning_rate": 3.376938362171848e-06, "loss": 0.0, "step": 2052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 244.90625, "completions/mean_terminated_length": 237.3157958984375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.06776215136051178, "epoch": 0.16424, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3767677601707546e-06, "loss": 0.0, "num_tokens": 88873182.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06913650408387184, "epoch": 0.16432, "grad_norm": 0.0, "learning_rate": 3.3765970443130997e-06, "loss": 0.0, "step": 2054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 217.9765625, "completions/mean_terminated_length": 197.36143493652344, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.05844376981258392, "epoch": 0.1644, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3764262146108314e-06, "loss": 0.0, "num_tokens": 88966619.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 2055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05903799459338188, "epoch": 0.16448, "grad_norm": 0.0, "learning_rate": 3.376255271075907e-06, "loss": 0.0, "step": 2056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.5703125, "completions/mean_terminated_length": 236.01162719726562, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.0752185620367527, "epoch": 0.16456, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3760842137202905e-06, "loss": 0.0, "num_tokens": 89063204.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0742926336824894, "epoch": 0.16464, "grad_norm": 0.0, "learning_rate": 3.3759130425559533e-06, "loss": 0.0, "step": 2058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.0390625, "completions/mean_terminated_length": 215.64474487304688, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.07113154605031013, "epoch": 0.16472, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3757417575948754e-06, "loss": 0.0, "num_tokens": 89158441.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07198642939329147, "epoch": 0.1648, "grad_norm": 0.0, "learning_rate": 3.375570358849046e-06, "loss": 0.0, "step": 2060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 232.6484375, "completions/mean_terminated_length": 220.4166717529297, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.06620775535702705, "epoch": 0.16488, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.375398846330461e-06, "loss": 0.0, "num_tokens": 89253756.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06601006537675858, "epoch": 0.16496, "grad_norm": 0.0, "learning_rate": 3.3752272200511243e-06, "loss": 0.0, "step": 2062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.65625, "completions/mean_terminated_length": 212.25, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.07284937426447868, "epoch": 0.16504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3750554800230473e-06, "loss": 0.0, "num_tokens": 89348560.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07367869839072227, "epoch": 0.16512, "grad_norm": 0.0, "learning_rate": 3.374883626258251e-06, "loss": 0.0, "step": 2064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.921875, "completions/mean_terminated_length": 218.9484405517578, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.07477959245443344, "epoch": 0.1652, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3747116587687622e-06, "loss": 0.0, "num_tokens": 89443270.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07610460370779037, "epoch": 0.16528, "grad_norm": 0.0, "learning_rate": 3.374539577566618e-06, "loss": 0.0, "step": 2066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 208.53125, "completions/mean_terminated_length": 188.4888916015625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.07534440979361534, "epoch": 0.16536, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.374367382663862e-06, "loss": 0.0, "num_tokens": 89535498.0, "reward": 0.7749170064926147, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7749170064926147, "rewards/reward_fn/std": 1.2903467416763306, "step": 2067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07485086098313332, "epoch": 0.16544, "grad_norm": 0.0, "learning_rate": 3.3741950740725454e-06, "loss": 0.0, "step": 2068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 243.5546875, "completions/mean_terminated_length": 233.875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.062304817140102386, "epoch": 0.16552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3740226518047282e-06, "loss": 0.0, "num_tokens": 89632209.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06804140657186508, "epoch": 0.1656, "grad_norm": 0.0, "learning_rate": 3.3738501158724787e-06, "loss": 0.0, "step": 2070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.2109375, "completions/mean_terminated_length": 238.0357208251953, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.07257286459207535, "epoch": 0.16568, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.373677466287872e-06, "loss": 0.0, "num_tokens": 89729004.0, "reward": 0.02467191591858864, "reward_std": 0.0, "rewards/reward_fn/mean": 0.02467191591858864, "rewards/reward_fn/std": 0.06553223729133606, "step": 2071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07173364236950874, "epoch": 0.16576, "grad_norm": 0.0, "learning_rate": 3.373504703062992e-06, "loss": 0.0, "step": 2072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.3046875, "completions/mean_terminated_length": 211.62338256835938, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.07485271990299225, "epoch": 0.16584, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.37333182620993e-06, "loss": 0.0, "num_tokens": 89823891.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06893808767199516, "epoch": 0.16592, "grad_norm": 0.0, "learning_rate": 3.3731588357407862e-06, "loss": 0.0, "step": 2074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 246.0625, "completions/mean_terminated_length": 237.01492309570312, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.07260312139987946, "epoch": 0.166, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3729857316676675e-06, "loss": 0.0, "num_tokens": 89920923.0, "reward": 0.012458499521017075, "reward_std": 0.0, "rewards/reward_fn/mean": 0.012458499521017075, "rewards/reward_fn/std": 0.03309160843491554, "step": 2075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0727185569703579, "epoch": 0.16608, "grad_norm": 0.0, "learning_rate": 3.3728125140026893e-06, "loss": 0.0, "step": 2076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.296875, "completions/mean_terminated_length": 239.21621704101562, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.07339129969477654, "epoch": 0.16616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.372639182757976e-06, "loss": 0.0, "num_tokens": 90017985.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07384423911571503, "epoch": 0.16624, "grad_norm": 0.0, "learning_rate": 3.3724657379456572e-06, "loss": 0.0, "step": 2078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 228.125, "completions/mean_terminated_length": 214.98851013183594, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.06887790933251381, "epoch": 0.16632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.372292179577874e-06, "loss": 0.0, "num_tokens": 90112721.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06880238279700279, "epoch": 0.1664, "grad_norm": 0.0, "learning_rate": 3.3721185076667727e-06, "loss": 0.0, "step": 2080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 219.3828125, "completions/mean_terminated_length": 202.12643432617188, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.07022137939929962, "epoch": 0.16648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3719447222245083e-06, "loss": 0.0, "num_tokens": 90206338.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07061386853456497, "epoch": 0.16656, "grad_norm": 0.0, "learning_rate": 3.371770823263245e-06, "loss": 0.0, "step": 2082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.40625, "completions/mean_terminated_length": 213.9310302734375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.06708525121212006, "epoch": 0.16664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3715968107951523e-06, "loss": 0.0, "num_tokens": 90300982.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0674309954047203, "epoch": 0.16672, "grad_norm": 0.0, "learning_rate": 3.3714226848324108e-06, "loss": 0.0, "step": 2084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 239.609375, "completions/mean_terminated_length": 226.02857971191406, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.06482011452317238, "epoch": 0.1668, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3712484453872066e-06, "loss": 0.0, "num_tokens": 90397188.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06771284714341164, "epoch": 0.16688, "grad_norm": 0.0, "learning_rate": 3.371074092471735e-06, "loss": 0.0, "step": 2086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 235.9140625, "completions/mean_terminated_length": 220.2916717529297, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.07103165239095688, "epoch": 0.16696, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3708996260981986e-06, "loss": 0.0, "num_tokens": 90492921.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07196439802646637, "epoch": 0.16704, "grad_norm": 0.0, "learning_rate": 3.3707250462788083e-06, "loss": 0.0, "step": 2088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.3671875, "completions/mean_terminated_length": 212.91209411621094, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06602561473846436, "epoch": 0.16712, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3705503530257826e-06, "loss": 0.0, "num_tokens": 90587304.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0687122792005539, "epoch": 0.1672, "grad_norm": 0.0, "learning_rate": 3.370375546351349e-06, "loss": 0.0, "step": 2090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.96875, "completions/mean_terminated_length": 233.20001220703125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.06872621551156044, "epoch": 0.16728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3702006262677406e-06, "loss": 0.0, "num_tokens": 90683556.0, "reward": 0.11994174867868423, "reward_std": 0.0, "rewards/reward_fn/mean": 0.11994174867868423, "rewards/reward_fn/std": 0.3185829222202301, "step": 2091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06586095318198204, "epoch": 0.16736, "grad_norm": 0.0, "learning_rate": 3.370025592787202e-06, "loss": 0.0, "step": 2092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 230.109375, "completions/mean_terminated_length": 209.97222900390625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.06999478861689568, "epoch": 0.16744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.369850445921982e-06, "loss": 0.0, "num_tokens": 90778546.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07037695124745369, "epoch": 0.16752, "grad_norm": 0.0, "learning_rate": 3.36967518568434e-06, "loss": 0.0, "step": 2094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.6328125, "completions/mean_terminated_length": 212.6212158203125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.0765252411365509, "epoch": 0.1676, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.369499812086542e-06, "loss": 0.0, "num_tokens": 90873987.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07716191187500954, "epoch": 0.16768, "grad_norm": 0.0, "learning_rate": 3.3693243251408617e-06, "loss": 0.0, "step": 2096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 209.328125, "completions/mean_terminated_length": 181.3249969482422, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.07000748440623283, "epoch": 0.16776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3691487248595824e-06, "loss": 0.0, "num_tokens": 90966317.0, "reward": 0.7673865556716919, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7673865556716919, "rewards/reward_fn/std": 1.2948493957519531, "step": 2097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06868317350745201, "epoch": 0.16784, "grad_norm": 0.0, "learning_rate": 3.368973011254994e-06, "loss": 0.0, "step": 2098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 249.3671875, "completions/mean_terminated_length": 241.362060546875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.06448943540453911, "epoch": 0.16792, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3687971843393944e-06, "loss": 0.0, "num_tokens": 91063772.0, "reward": 0.0074910130351781845, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0074910130351781845, "rewards/reward_fn/std": 0.019897233694791794, "step": 2099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06591064110398293, "epoch": 0.168, "grad_norm": 0.0, "learning_rate": 3.368621244125089e-06, "loss": 0.0, "step": 2100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 201.984375, "completions/mean_terminated_length": 181.65591430664062, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.07111629098653793, "epoch": 0.16808, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.368445190624393e-06, "loss": 0.0, "num_tokens": 91155162.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06715728342533112, "epoch": 0.16816, "grad_norm": 0.0, "learning_rate": 3.3682690238496274e-06, "loss": 0.0, "step": 2102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.9609375, "completions/mean_terminated_length": 225.09637451171875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.07460172474384308, "epoch": 0.16824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3680927438131224e-06, "loss": 0.0, "num_tokens": 91250901.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07237908244132996, "epoch": 0.16832, "grad_norm": 0.0, "learning_rate": 3.3679163505272155e-06, "loss": 0.0, "step": 2104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.96875, "completions/mean_terminated_length": 240.7894744873047, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.06888929009437561, "epoch": 0.1684, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.367739844004252e-06, "loss": 0.0, "num_tokens": 91348049.0, "reward": 0.11557802557945251, "reward_std": 0.0, "rewards/reward_fn/mean": 0.11557802557945251, "rewards/reward_fn/std": 0.30699223279953003, "step": 2105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0701800025999546, "epoch": 0.16848, "grad_norm": 0.0, "learning_rate": 3.3675632242565868e-06, "loss": 0.0, "step": 2106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 204.7734375, "completions/mean_terminated_length": 189.76766967773438, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.07014354690909386, "epoch": 0.16856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3673864912965805e-06, "loss": 0.0, "num_tokens": 91439796.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06692549586296082, "epoch": 0.16864, "grad_norm": 0.0, "learning_rate": 3.367209645136602e-06, "loss": 0.0, "step": 2108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.9609375, "completions/mean_terminated_length": 211.21685791015625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.061801088973879814, "epoch": 0.16872, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3670326857890302e-06, "loss": 0.0, "num_tokens": 91534383.0, "reward": 0.7549973726272583, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7549973726272583, "rewards/reward_fn/std": 1.3013103008270264, "step": 2109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06176443211734295, "epoch": 0.1688, "grad_norm": 0.0, "learning_rate": 3.366855613266249e-06, "loss": 0.0, "step": 2110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.1484375, "completions/mean_terminated_length": 224.84693908691406, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.06469385512173176, "epoch": 0.16888, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3666784275806526e-06, "loss": 0.0, "num_tokens": 91629634.0, "reward": 0.03411313518881798, "reward_std": 0.0, "rewards/reward_fn/mean": 0.03411313518881798, "rewards/reward_fn/std": 0.09060950577259064, "step": 2111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06871993467211723, "epoch": 0.16896, "grad_norm": 0.0, "learning_rate": 3.366501128744641e-06, "loss": 0.0, "step": 2112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.921875, "completions/mean_terminated_length": 219.10345458984375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.06906973570585251, "epoch": 0.16904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.366323716770624e-06, "loss": 0.0, "num_tokens": 91724728.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06911235675215721, "epoch": 0.16912, "grad_norm": 0.0, "learning_rate": 3.366146191671018e-06, "loss": 0.0, "step": 2114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.3125, "completions/mean_terminated_length": 199.3103485107422, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.07032880187034607, "epoch": 0.1692, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.365968553458249e-06, "loss": 0.0, "num_tokens": 91819744.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06924130395054817, "epoch": 0.16928, "grad_norm": 0.0, "learning_rate": 3.3657908021447485e-06, "loss": 0.0, "step": 2116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 206.15625, "completions/mean_terminated_length": 191.55555725097656, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.07618816196918488, "epoch": 0.16936, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3656129377429587e-06, "loss": 0.0, "num_tokens": 91911668.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07816991209983826, "epoch": 0.16944, "grad_norm": 0.0, "learning_rate": 3.365434960265327e-06, "loss": 0.0, "step": 2118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 215.765625, "completions/mean_terminated_length": 206.48077392578125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.07003950327634811, "epoch": 0.16952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.36525686972431e-06, "loss": 0.0, "num_tokens": 92004822.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06880950182676315, "epoch": 0.1696, "grad_norm": 0.0, "learning_rate": 3.3650786661323727e-06, "loss": 0.0, "step": 2120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 228.828125, "completions/mean_terminated_length": 213.58535766601562, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.07523000240325928, "epoch": 0.16968, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.364900349501987e-06, "loss": 0.0, "num_tokens": 92099648.0, "reward": 0.3799973428249359, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3799973428249359, "rewards/reward_fn/std": 0.9942457675933838, "step": 2121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07880717515945435, "epoch": 0.16976, "grad_norm": 0.0, "learning_rate": 3.3647219198456334e-06, "loss": 0.0, "step": 2122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.2109375, "completions/mean_terminated_length": 224.29348754882812, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.06987670063972473, "epoch": 0.16984, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3645433771758003e-06, "loss": 0.0, "num_tokens": 92195035.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07299726456403732, "epoch": 0.16992, "grad_norm": 0.0, "learning_rate": 3.3643647215049834e-06, "loss": 0.0, "step": 2124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.109375, "completions/mean_terminated_length": 237.65789794921875, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.06733916699886322, "epoch": 0.17, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.364185952845687e-06, "loss": 0.0, "num_tokens": 92291945.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06882385537028313, "epoch": 0.17008, "grad_norm": 0.0, "learning_rate": 3.364007071210423e-06, "loss": 0.0, "step": 2126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5078125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.9921875, "completions/mean_terminated_length": 219.41270446777344, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.06693575903773308, "epoch": 0.17016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.363828076611711e-06, "loss": 0.0, "num_tokens": 92387944.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0677955113351345, "epoch": 0.17024, "grad_norm": 0.0, "learning_rate": 3.3636489690620794e-06, "loss": 0.0, "step": 2128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.640625, "completions/mean_terminated_length": 212.56668090820312, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.0697944425046444, "epoch": 0.17032, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.363469748574063e-06, "loss": 0.0, "num_tokens": 92483642.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07051289826631546, "epoch": 0.1704, "grad_norm": 0.0, "learning_rate": 3.3632904151602054e-06, "loss": 0.0, "step": 2130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 226.6640625, "completions/mean_terminated_length": 218.4499969482422, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.06363129802048206, "epoch": 0.17048, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.363110968833058e-06, "loss": 0.0, "num_tokens": 92578191.0, "reward": 0.002499666763469577, "reward_std": 0.0, "rewards/reward_fn/mean": 0.002499666763469577, "rewards/reward_fn/std": 0.006639483384788036, "step": 2131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.060909856110811234, "epoch": 0.17056, "grad_norm": 0.0, "learning_rate": 3.3629314096051813e-06, "loss": 0.0, "step": 2132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.8984375, "completions/mean_terminated_length": 209.43955993652344, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.07024603709578514, "epoch": 0.17064, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3627517374891416e-06, "loss": 0.0, "num_tokens": 92672258.0, "reward": 0.09519927203655243, "reward_std": 0.0, "rewards/reward_fn/mean": 0.09519927203655243, "rewards/reward_fn/std": 0.2528632879257202, "step": 2133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06701686233282089, "epoch": 0.17072, "grad_norm": 0.0, "learning_rate": 3.362571952497514e-06, "loss": 0.0, "step": 2134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 217.875, "completions/mean_terminated_length": 182.06060791015625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.06758534535765648, "epoch": 0.1708, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3623920546428815e-06, "loss": 0.0, "num_tokens": 92765682.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06640054285526276, "epoch": 0.17088, "grad_norm": 0.0, "learning_rate": 3.3622120439378352e-06, "loss": 0.0, "step": 2136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 212.71875, "completions/mean_terminated_length": 194.44444274902344, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.06600243598222733, "epoch": 0.17096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3620319203949743e-06, "loss": 0.0, "num_tokens": 92858446.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06447144597768784, "epoch": 0.17104, "grad_norm": 0.0, "learning_rate": 3.3618516840269047e-06, "loss": 0.0, "step": 2138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.859375, "completions/mean_terminated_length": 211.4081573486328, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.06491562724113464, "epoch": 0.17112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3616713348462426e-06, "loss": 0.0, "num_tokens": 92952380.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07090241834521294, "epoch": 0.1712, "grad_norm": 0.0, "learning_rate": 3.361490872865609e-06, "loss": 0.0, "step": 2140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.6875, "completions/mean_terminated_length": 223.1325225830078, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.06868863105773926, "epoch": 0.17128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3613102980976347e-06, "loss": 0.0, "num_tokens": 93047956.0, "reward": 0.002499666763469577, "reward_std": 0.0, "rewards/reward_fn/mean": 0.002499666763469577, "rewards/reward_fn/std": 0.006639483384788036, "step": 2141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07186181470751762, "epoch": 0.17136, "grad_norm": 0.0, "learning_rate": 3.361129610554958e-06, "loss": 0.0, "step": 2142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 198.5, "completions/mean_terminated_length": 181.6565704345703, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.0784594714641571, "epoch": 0.17144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.360948810250226e-06, "loss": 0.0, "num_tokens": 93138900.0, "reward": 1.5, "reward_std": 0.0, "rewards/reward_fn/mean": 1.5, "rewards/reward_fn/std": 1.5058939456939697, "step": 2143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07403632253408432, "epoch": 0.17152, "grad_norm": 0.0, "learning_rate": 3.360767897196091e-06, "loss": 0.0, "step": 2144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.8046875, "completions/mean_terminated_length": 233.1666717529297, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.06472373008728027, "epoch": 0.1716, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.360586871405217e-06, "loss": 0.0, "num_tokens": 93234875.0, "reward": 0.004997334908694029, "reward_std": 0.0, "rewards/reward_fn/mean": 0.004997334908694029, "rewards/reward_fn/std": 0.013273656368255615, "step": 2145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06547341495752335, "epoch": 0.17168, "grad_norm": 0.0, "learning_rate": 3.3604057328902732e-06, "loss": 0.0, "step": 2146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.7421875, "completions/mean_terminated_length": 234.05618286132812, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.06495152786374092, "epoch": 0.17176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.360224481663937e-06, "loss": 0.0, "num_tokens": 93331226.0, "reward": 0.4091131389141083, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4091131389141083, "rewards/reward_fn/std": 0.9871928691864014, "step": 2147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06919114664196968, "epoch": 0.17184, "grad_norm": 0.0, "learning_rate": 3.360043117738894e-06, "loss": 0.0, "step": 2148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.109375, "completions/mean_terminated_length": 235.28260803222656, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.06531337276101112, "epoch": 0.17192, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.359861641127838e-06, "loss": 0.0, "num_tokens": 93427624.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06564783677458763, "epoch": 0.172, "grad_norm": 0.0, "learning_rate": 3.3596800518434703e-06, "loss": 0.0, "step": 2150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.921875, "completions/mean_terminated_length": 216.37037658691406, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.06377125903964043, "epoch": 0.17208, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3594983498985012e-06, "loss": 0.0, "num_tokens": 93522718.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0689004510641098, "epoch": 0.17216, "grad_norm": 0.0, "learning_rate": 3.3593165353056465e-06, "loss": 0.0, "step": 2152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 219.703125, "completions/mean_terminated_length": 194.0533447265625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.06899545714259148, "epoch": 0.17224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3591346080776318e-06, "loss": 0.0, "num_tokens": 93616376.0, "reward": 0.8675283193588257, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8675283193588257, "rewards/reward_fn/std": 1.273011565208435, "step": 2153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06837121769785881, "epoch": 0.17232, "grad_norm": 0.0, "learning_rate": 3.3589525682271906e-06, "loss": 0.0, "step": 2154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.1171875, "completions/mean_terminated_length": 202.1267547607422, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.06861043721437454, "epoch": 0.1724, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3587704157670636e-06, "loss": 0.0, "num_tokens": 93710855.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06667402386665344, "epoch": 0.17248, "grad_norm": 0.0, "learning_rate": 3.3585881507099986e-06, "loss": 0.0, "step": 2156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.046875, "completions/mean_terminated_length": 242.5894775390625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.07248855382204056, "epoch": 0.17256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3584057730687532e-06, "loss": 0.0, "num_tokens": 93807885.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07114940136671066, "epoch": 0.17264, "grad_norm": 0.0, "learning_rate": 3.3582232828560914e-06, "loss": 0.0, "step": 2158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.09375, "completions/mean_terminated_length": 232.5977020263672, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.06503454595804214, "epoch": 0.17272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3580406800847868e-06, "loss": 0.0, "num_tokens": 93904153.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06550344824790955, "epoch": 0.1728, "grad_norm": 0.0, "learning_rate": 3.3578579647676177e-06, "loss": 0.0, "step": 2160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.453125, "completions/mean_terminated_length": 239.7066650390625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.07130148634314537, "epoch": 0.17288, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.357675136917374e-06, "loss": 0.0, "num_tokens": 94001235.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06923921033740044, "epoch": 0.17296, "grad_norm": 0.0, "learning_rate": 3.357492196546851e-06, "loss": 0.0, "step": 2162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.7578125, "completions/mean_terminated_length": 211.3275909423828, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.061464279890060425, "epoch": 0.17304, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3573091436688518e-06, "loss": 0.0, "num_tokens": 94096948.0, "reward": 0.1370438039302826, "reward_std": 0.0, "rewards/reward_fn/mean": 0.1370438039302826, "rewards/reward_fn/std": 0.3278296887874603, "step": 2163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0596650093793869, "epoch": 0.17312, "grad_norm": 0.0, "learning_rate": 3.3571259782961896e-06, "loss": 0.0, "step": 2164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.890625, "completions/mean_terminated_length": 216.77896118164062, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.06797152385115623, "epoch": 0.1732, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.356942700441683e-06, "loss": 0.0, "num_tokens": 94191526.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06840318441390991, "epoch": 0.17328, "grad_norm": 0.0, "learning_rate": 3.3567593101181596e-06, "loss": 0.0, "step": 2166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.609375, "completions/mean_terminated_length": 222.10389709472656, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.07368200644850731, "epoch": 0.17336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.356575807338456e-06, "loss": 0.0, "num_tokens": 94287220.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07437816634774208, "epoch": 0.17344, "grad_norm": 0.0, "learning_rate": 3.356392192115414e-06, "loss": 0.0, "step": 2168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.953125, "completions/mean_terminated_length": 234.83517456054688, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.07234779745340347, "epoch": 0.17352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3562084644618853e-06, "loss": 0.0, "num_tokens": 94383598.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07355298101902008, "epoch": 0.1736, "grad_norm": 0.0, "learning_rate": 3.3560246243907288e-06, "loss": 0.0, "step": 2170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 205.8828125, "completions/mean_terminated_length": 195.48114013671875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.06435573473572731, "epoch": 0.17368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3558406719148116e-06, "loss": 0.0, "num_tokens": 94475487.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06940837576985359, "epoch": 0.17376, "grad_norm": 0.0, "learning_rate": 3.355656607047008e-06, "loss": 0.0, "step": 2172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.5078125, "completions/mean_terminated_length": 211.0595245361328, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.07446900010108948, "epoch": 0.17384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3554724298002006e-06, "loss": 0.0, "num_tokens": 94570016.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07684371620416641, "epoch": 0.17392, "grad_norm": 0.0, "learning_rate": 3.3552881401872807e-06, "loss": 0.0, "step": 2174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 249.8671875, "completions/mean_terminated_length": 244.4558868408203, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.07206225022673607, "epoch": 0.174, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.355103738221146e-06, "loss": 0.0, "num_tokens": 94667535.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07126401737332344, "epoch": 0.17408, "grad_norm": 0.0, "learning_rate": 3.3549192239147022e-06, "loss": 0.0, "step": 2176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 223.0859375, "completions/mean_terminated_length": 201.9871826171875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.07015961781144142, "epoch": 0.17416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3547345972808646e-06, "loss": 0.0, "num_tokens": 94761626.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06776939332485199, "epoch": 0.17424, "grad_norm": 0.0, "learning_rate": 3.354549858332554e-06, "loss": 0.0, "step": 2178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.578125, "completions/mean_terminated_length": 225.13978576660156, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.07190212607383728, "epoch": 0.17432, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3543650070827003e-06, "loss": 0.0, "num_tokens": 94857060.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07264028862118721, "epoch": 0.1744, "grad_norm": 0.0, "learning_rate": 3.3541800435442417e-06, "loss": 0.0, "step": 2180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 235.3203125, "completions/mean_terminated_length": 224.10842895507812, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.07339531928300858, "epoch": 0.17448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3539949677301234e-06, "loss": 0.0, "num_tokens": 94952717.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07060198113322258, "epoch": 0.17456, "grad_norm": 0.0, "learning_rate": 3.3538097796532987e-06, "loss": 0.0, "step": 2182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 189.78125, "completions/mean_terminated_length": 182.29563903808594, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.07036738470196724, "epoch": 0.17464, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.353624479326729e-06, "loss": 0.0, "num_tokens": 95042545.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06414381414651871, "epoch": 0.17472, "grad_norm": 0.0, "learning_rate": 3.3534390667633834e-06, "loss": 0.0, "step": 2184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 213.2578125, "completions/mean_terminated_length": 183.0533447265625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.06344977393746376, "epoch": 0.1748, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.353253541976238e-06, "loss": 0.0, "num_tokens": 95135378.0, "reward": 1.2180346250534058, "reward_std": 0.0, "rewards/reward_fn/mean": 1.2180346250534058, "rewards/reward_fn/std": 1.4057286977767944, "step": 2185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06742238625884056, "epoch": 0.17488, "grad_norm": 0.0, "learning_rate": 3.3530679049782783e-06, "loss": 0.0, "step": 2186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1171875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 216.3203125, "completions/mean_terminated_length": 211.0531005859375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.06632799655199051, "epoch": 0.17496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3528821557824974e-06, "loss": 0.0, "num_tokens": 95228603.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06593495607376099, "epoch": 0.17504, "grad_norm": 0.0, "learning_rate": 3.352696294401895e-06, "loss": 0.0, "step": 2188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.3984375, "completions/mean_terminated_length": 233.0459747314453, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.06096465699374676, "epoch": 0.17512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3525103208494794e-06, "loss": 0.0, "num_tokens": 95324910.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0629868172109127, "epoch": 0.1752, "grad_norm": 0.0, "learning_rate": 3.352324235138267e-06, "loss": 0.0, "step": 2190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 229.265625, "completions/mean_terminated_length": 212.6835479736328, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.06857365369796753, "epoch": 0.17528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.352138037281282e-06, "loss": 0.0, "num_tokens": 95419792.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07037734240293503, "epoch": 0.17536, "grad_norm": 0.0, "learning_rate": 3.3519517272915565e-06, "loss": 0.0, "step": 2192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 225.9609375, "completions/mean_terminated_length": 204.73333740234375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.07712192088365555, "epoch": 0.17544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.35176530518213e-06, "loss": 0.0, "num_tokens": 95514251.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07282429933547974, "epoch": 0.17552, "grad_norm": 0.0, "learning_rate": 3.3515787709660496e-06, "loss": 0.0, "step": 2194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.2734375, "completions/mean_terminated_length": 235.08334350585938, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.07346059009432793, "epoch": 0.1756, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.351392124656371e-06, "loss": 0.0, "num_tokens": 95610798.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07656803727149963, "epoch": 0.17568, "grad_norm": 0.0, "learning_rate": 3.351205366266158e-06, "loss": 0.0, "step": 2196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 218.140625, "completions/mean_terminated_length": 209.84762573242188, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.07563304901123047, "epoch": 0.17576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.351018495808481e-06, "loss": 0.0, "num_tokens": 95704256.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0723971351981163, "epoch": 0.17584, "grad_norm": 0.0, "learning_rate": 3.3508315132964196e-06, "loss": 0.0, "step": 2198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 217.9921875, "completions/mean_terminated_length": 208.30392456054688, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.07465501129627228, "epoch": 0.17592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.35064441874306e-06, "loss": 0.0, "num_tokens": 95797695.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07878351956605911, "epoch": 0.176, "grad_norm": 0.0, "learning_rate": 3.3504572121614963e-06, "loss": 0.0, "step": 2200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.5390625, "completions/mean_terminated_length": 236.17567443847656, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.06825370341539383, "epoch": 0.17608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3502698935648324e-06, "loss": 0.0, "num_tokens": 95894532.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06821465864777565, "epoch": 0.17616, "grad_norm": 0.0, "learning_rate": 3.3500824629661787e-06, "loss": 0.0, "step": 2202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 222.3828125, "completions/mean_terminated_length": 209.73118591308594, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.07261526957154274, "epoch": 0.17624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3498949203786518e-06, "loss": 0.0, "num_tokens": 95988533.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06932543963193893, "epoch": 0.17632, "grad_norm": 0.0, "learning_rate": 3.3497072658153788e-06, "loss": 0.0, "step": 2204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.046875, "completions/mean_terminated_length": 225.1724090576172, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.0854736790060997, "epoch": 0.1764, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3495194992894934e-06, "loss": 0.0, "num_tokens": 96084155.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08468625694513321, "epoch": 0.17648, "grad_norm": 0.0, "learning_rate": 3.3493316208141365e-06, "loss": 0.0, "step": 2206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 214.0625, "completions/mean_terminated_length": 199.4947509765625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.07198421657085419, "epoch": 0.17656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.349143630402459e-06, "loss": 0.0, "num_tokens": 96177091.0, "reward": 0.15653178095817566, "reward_std": 0.0, "rewards/reward_fn/mean": 0.15653178095817566, "rewards/reward_fn/std": 0.27857261896133423, "step": 2207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0735948383808136, "epoch": 0.17664, "grad_norm": 0.0, "learning_rate": 3.3489555280676174e-06, "loss": 0.0, "step": 2208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.3125, "completions/mean_terminated_length": 239.40298461914062, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.07559384033083916, "epoch": 0.17672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3487673138227772e-06, "loss": 0.0, "num_tokens": 96274283.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07490692660212517, "epoch": 0.1768, "grad_norm": 0.0, "learning_rate": 3.3485789876811108e-06, "loss": 0.0, "step": 2210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.9609375, "completions/mean_terminated_length": 236.7375030517578, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.06558989733457565, "epoch": 0.17688, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3483905496557995e-06, "loss": 0.0, "num_tokens": 96371046.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0657649114727974, "epoch": 0.17696, "grad_norm": 0.0, "learning_rate": 3.348201999760032e-06, "loss": 0.0, "step": 2212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 243.0859375, "completions/mean_terminated_length": 232.04348754882812, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.06813374161720276, "epoch": 0.17704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.348013338007005e-06, "loss": 0.0, "num_tokens": 96467697.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06746044009923935, "epoch": 0.17712, "grad_norm": 0.0, "learning_rate": 3.3478245644099217e-06, "loss": 0.0, "step": 2214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.1796875, "completions/mean_terminated_length": 240.60000610351562, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.06343934312462807, "epoch": 0.1772, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.347635678981996e-06, "loss": 0.0, "num_tokens": 96565000.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06316492147743702, "epoch": 0.17728, "grad_norm": 0.0, "learning_rate": 3.347446681736447e-06, "loss": 0.0, "step": 2216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.9296875, "completions/mean_terminated_length": 216.07778930664062, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.06152984872460365, "epoch": 0.17736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.347257572686502e-06, "loss": 0.0, "num_tokens": 96659711.0, "reward": 0.4565883278846741, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4565883278846741, "rewards/reward_fn/std": 0.9886437654495239, "step": 2217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06100852228701115, "epoch": 0.17744, "grad_norm": 0.0, "learning_rate": 3.3470683518453975e-06, "loss": 0.0, "step": 2218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5078125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 217.90625, "completions/mean_terminated_length": 178.60317993164062, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.06295708194375038, "epoch": 0.17752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3468790192263762e-06, "loss": 0.0, "num_tokens": 96753139.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0658697709441185, "epoch": 0.1776, "grad_norm": 0.0, "learning_rate": 3.34668957484269e-06, "loss": 0.0, "step": 2220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 243.84375, "completions/mean_terminated_length": 236.5500030517578, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.06979602575302124, "epoch": 0.17768, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.346500018707598e-06, "loss": 0.0, "num_tokens": 96849887.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06794445589184761, "epoch": 0.17776, "grad_norm": 0.0, "learning_rate": 3.3463103508343665e-06, "loss": 0.0, "step": 2222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.1875, "completions/mean_terminated_length": 235.56756591796875, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.06872374564409256, "epoch": 0.17784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3461205712362707e-06, "loss": 0.0, "num_tokens": 96946679.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0701710656285286, "epoch": 0.17792, "grad_norm": 0.0, "learning_rate": 3.3459306799265935e-06, "loss": 0.0, "step": 2224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.3671875, "completions/mean_terminated_length": 224.47000122070312, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.07499092817306519, "epoch": 0.178, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3457406769186248e-06, "loss": 0.0, "num_tokens": 97041830.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07493658736348152, "epoch": 0.17808, "grad_norm": 0.0, "learning_rate": 3.345550562225663e-06, "loss": 0.0, "step": 2226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 223.828125, "completions/mean_terminated_length": 198.80555725097656, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.06972258165478706, "epoch": 0.17816, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3453603358610136e-06, "loss": 0.0, "num_tokens": 97136016.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07054179906845093, "epoch": 0.17824, "grad_norm": 0.0, "learning_rate": 3.3451699978379914e-06, "loss": 0.0, "step": 2228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 219.6484375, "completions/mean_terminated_length": 201.89535522460938, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.06698872894048691, "epoch": 0.17832, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3449795481699174e-06, "loss": 0.0, "num_tokens": 97229667.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06378224119544029, "epoch": 0.1784, "grad_norm": 0.0, "learning_rate": 3.344788986870121e-06, "loss": 0.0, "step": 2230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 207.8125, "completions/mean_terminated_length": 193.06121826171875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.08136820048093796, "epoch": 0.17848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3445983139519395e-06, "loss": 0.0, "num_tokens": 97321803.0, "reward": 0.4956766963005066, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4956766963005066, "rewards/reward_fn/std": 1.001822590827942, "step": 2231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07750429958105087, "epoch": 0.17856, "grad_norm": 0.0, "learning_rate": 3.3444075294287187e-06, "loss": 0.0, "step": 2232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 224.8046875, "completions/mean_terminated_length": 210.625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.07262161746621132, "epoch": 0.17864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.344216633313811e-06, "loss": 0.0, "num_tokens": 97416114.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07343540340662003, "epoch": 0.17872, "grad_norm": 0.0, "learning_rate": 3.3440256256205763e-06, "loss": 0.0, "step": 2234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.5625, "completions/mean_terminated_length": 220.87998962402344, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.07426069676876068, "epoch": 0.1788, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.343834506362385e-06, "loss": 0.0, "num_tokens": 97510906.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07070136442780495, "epoch": 0.17888, "grad_norm": 0.0, "learning_rate": 3.3436432755526115e-06, "loss": 0.0, "step": 2236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 219.0390625, "completions/mean_terminated_length": 202.8426971435547, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.07039187103509903, "epoch": 0.17896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3434519332046405e-06, "loss": 0.0, "num_tokens": 97604479.0, "reward": 0.002499666763469577, "reward_std": 0.0, "rewards/reward_fn/mean": 0.002499666763469577, "rewards/reward_fn/std": 0.006639483384788036, "step": 2237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07071125507354736, "epoch": 0.17904, "grad_norm": 0.0, "learning_rate": 3.343260479331865e-06, "loss": 0.0, "step": 2238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 244.65625, "completions/mean_terminated_length": 234.0, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.06635695695877075, "epoch": 0.17912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3430689139476838e-06, "loss": 0.0, "num_tokens": 97701331.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06811018660664558, "epoch": 0.1792, "grad_norm": 0.0, "learning_rate": 3.342877237065504e-06, "loss": 0.0, "step": 2240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.5078125, "completions/mean_terminated_length": 228.7623748779297, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.06873642653226852, "epoch": 0.17928, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3426854486987424e-06, "loss": 0.0, "num_tokens": 97796884.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0668913945555687, "epoch": 0.17936, "grad_norm": 0.0, "learning_rate": 3.342493548860821e-06, "loss": 0.0, "step": 2242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.5390625, "completions/mean_terminated_length": 209.6027374267578, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.061518266797065735, "epoch": 0.17944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.342301537565171e-06, "loss": 0.0, "num_tokens": 97891801.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06442930735647678, "epoch": 0.17952, "grad_norm": 0.0, "learning_rate": 3.3421094148252314e-06, "loss": 0.0, "step": 2244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 249.1484375, "completions/mean_terminated_length": 244.4605255126953, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.06606769561767578, "epoch": 0.1796, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3419171806544488e-06, "loss": 0.0, "num_tokens": 97989228.0, "reward": 0.38992840051651, "reward_std": 0.0, "rewards/reward_fn/mean": 0.38992840051651, "rewards/reward_fn/std": 0.9911679029464722, "step": 2245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06844442337751389, "epoch": 0.17968, "grad_norm": 0.0, "learning_rate": 3.3417248350662766e-06, "loss": 0.0, "step": 2246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.28125, "completions/mean_terminated_length": 236.61904907226562, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.0676659643650055, "epoch": 0.17976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.341532378074178e-06, "loss": 0.0, "num_tokens": 98085904.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06975670903921127, "epoch": 0.17984, "grad_norm": 0.0, "learning_rate": 3.341339809691623e-06, "loss": 0.0, "step": 2248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 249.6171875, "completions/mean_terminated_length": 241.4107208251953, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.06480912491679192, "epoch": 0.17992, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.341147129932089e-06, "loss": 0.0, "num_tokens": 98183391.0, "reward": 0.051705554127693176, "reward_std": 0.0, "rewards/reward_fn/mean": 0.051705554127693176, "rewards/reward_fn/std": 0.1373375654220581, "step": 2249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06491713598370552, "epoch": 0.18, "grad_norm": 0.0, "learning_rate": 3.3409543388090618e-06, "loss": 0.0, "step": 2250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.84375, "completions/mean_terminated_length": 220.5500030517578, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.07344362139701843, "epoch": 0.18008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.340761436336034e-06, "loss": 0.0, "num_tokens": 98278859.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07567934319376945, "epoch": 0.18016, "grad_norm": 0.0, "learning_rate": 3.340568422526508e-06, "loss": 0.0, "step": 2252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5078125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.171875, "completions/mean_terminated_length": 231.96826171875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.07675479725003242, "epoch": 0.18024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3403752973939913e-06, "loss": 0.0, "num_tokens": 98375649.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07574466243386269, "epoch": 0.18032, "grad_norm": 0.0, "learning_rate": 3.3401820609520014e-06, "loss": 0.0, "step": 2254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 197.8359375, "completions/mean_terminated_length": 171.39773559570312, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.0707123912870884, "epoch": 0.1804, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3399887132140633e-06, "loss": 0.0, "num_tokens": 98466508.0, "reward": 0.07070886343717575, "reward_std": 0.0, "rewards/reward_fn/mean": 0.07070886343717575, "rewards/reward_fn/std": 0.14780206978321075, "step": 2255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06907816976308823, "epoch": 0.18048, "grad_norm": 0.0, "learning_rate": 3.339795254193708e-06, "loss": 0.0, "step": 2256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.40625, "completions/mean_terminated_length": 236.0399932861328, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.06472354754805565, "epoch": 0.18056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3396016839044766e-06, "loss": 0.0, "num_tokens": 98562816.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06389006972312927, "epoch": 0.18064, "grad_norm": 0.0, "learning_rate": 3.339408002359917e-06, "loss": 0.0, "step": 2258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 215.515625, "completions/mean_terminated_length": 203.6565704345703, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.07804389670491219, "epoch": 0.18072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.339214209573585e-06, "loss": 0.0, "num_tokens": 98655938.0, "reward": 0.8701362609863281, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8701362609863281, "rewards/reward_fn/std": 1.273180603981018, "step": 2259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07962044700980186, "epoch": 0.1808, "grad_norm": 0.0, "learning_rate": 3.3390203055590426e-06, "loss": 0.0, "step": 2260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.96875, "completions/mean_terminated_length": 225.05746459960938, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.06666158512234688, "epoch": 0.18088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.338826290329862e-06, "loss": 0.0, "num_tokens": 98751550.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06573881208896637, "epoch": 0.18096, "grad_norm": 0.0, "learning_rate": 3.3386321638996227e-06, "loss": 0.0, "step": 2262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.96875, "completions/mean_terminated_length": 182.16949462890625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.07389524206519127, "epoch": 0.18104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.338437926281911e-06, "loss": 0.0, "num_tokens": 98845498.0, "reward": 0.7794369459152222, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7794369459152222, "rewards/reward_fn/std": 1.2893400192260742, "step": 2263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06812901422381401, "epoch": 0.18112, "grad_norm": 0.0, "learning_rate": 3.3382435774903216e-06, "loss": 0.0, "step": 2264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1640625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.578125, "completions/mean_terminated_length": 229.1775665283203, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.0676744356751442, "epoch": 0.1812, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3380491175384566e-06, "loss": 0.0, "num_tokens": 98940932.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0698498785495758, "epoch": 0.18128, "grad_norm": 0.0, "learning_rate": 3.3378545464399264e-06, "loss": 0.0, "step": 2266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 209.3125, "completions/mean_terminated_length": 193.09474182128906, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.06496305763721466, "epoch": 0.18136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.337659864208349e-06, "loss": 0.0, "num_tokens": 99033260.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07051005586981773, "epoch": 0.18144, "grad_norm": 0.0, "learning_rate": 3.3374650708573498e-06, "loss": 0.0, "step": 2268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.2421875, "completions/mean_terminated_length": 218.9146270751953, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.07124688476324081, "epoch": 0.18152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3372701664005625e-06, "loss": 0.0, "num_tokens": 99128523.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07878901809453964, "epoch": 0.1816, "grad_norm": 0.0, "learning_rate": 3.3370751508516286e-06, "loss": 0.0, "step": 2270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 223.09375, "completions/mean_terminated_length": 207.58621215820312, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.06836150959134102, "epoch": 0.18168, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3368800242241962e-06, "loss": 0.0, "num_tokens": 99222615.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07014656811952591, "epoch": 0.18176, "grad_norm": 0.0, "learning_rate": 3.3366847865319234e-06, "loss": 0.0, "step": 2272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 217.578125, "completions/mean_terminated_length": 200.1136474609375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.07078474387526512, "epoch": 0.18184, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.336489437788473e-06, "loss": 0.0, "num_tokens": 99316001.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0716305784881115, "epoch": 0.18192, "grad_norm": 0.0, "learning_rate": 3.3362939780075196e-06, "loss": 0.0, "step": 2274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.9375, "completions/mean_terminated_length": 234.25350952148438, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.06814943253993988, "epoch": 0.182, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.336098407202742e-06, "loss": 0.0, "num_tokens": 99412761.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06979719549417496, "epoch": 0.18208, "grad_norm": 0.0, "learning_rate": 3.3359027253878277e-06, "loss": 0.0, "step": 2276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5390625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 247.7421875, "completions/mean_terminated_length": 238.08474731445312, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.07181204482913017, "epoch": 0.18216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3357069325764735e-06, "loss": 0.0, "num_tokens": 99510008.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07044494897127151, "epoch": 0.18224, "grad_norm": 0.0, "learning_rate": 3.335511028782382e-06, "loss": 0.0, "step": 2278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 231.6875, "completions/mean_terminated_length": 218.50601196289062, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.06443336978554726, "epoch": 0.18232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.335315014019265e-06, "loss": 0.0, "num_tokens": 99605200.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06495154649019241, "epoch": 0.1824, "grad_norm": 0.0, "learning_rate": 3.3351188883008407e-06, "loss": 0.0, "step": 2280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.5546875, "completions/mean_terminated_length": 240.5, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.06611081585288048, "epoch": 0.18248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.334922651640836e-06, "loss": 0.0, "num_tokens": 99702295.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0674954243004322, "epoch": 0.18256, "grad_norm": 0.0, "learning_rate": 3.334726304052986e-06, "loss": 0.0, "step": 2282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.375, "completions/mean_terminated_length": 236.9882354736328, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.070468470454216, "epoch": 0.18264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3345298455510324e-06, "loss": 0.0, "num_tokens": 99798983.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07194381952285767, "epoch": 0.18272, "grad_norm": 0.0, "learning_rate": 3.3343332761487254e-06, "loss": 0.0, "step": 2284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.203125, "completions/mean_terminated_length": 224.57424926757812, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.07659817487001419, "epoch": 0.1828, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3341365958598234e-06, "loss": 0.0, "num_tokens": 99894113.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07814202085137367, "epoch": 0.18288, "grad_norm": 0.0, "learning_rate": 3.3339398046980906e-06, "loss": 0.0, "step": 2286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 240.40625, "completions/mean_terminated_length": 230.41026306152344, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.06296126544475555, "epoch": 0.18296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3337429026773015e-06, "loss": 0.0, "num_tokens": 99990421.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06465374305844307, "epoch": 0.18304, "grad_norm": 0.0, "learning_rate": 3.3335458898112363e-06, "loss": 0.0, "step": 2288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.1015625, "completions/mean_terminated_length": 219.8513641357422, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.07127439230680466, "epoch": 0.18312, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3333487661136845e-06, "loss": 0.0, "num_tokens": 100086050.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07232696563005447, "epoch": 0.1832, "grad_norm": 0.0, "learning_rate": 3.3331515315984425e-06, "loss": 0.0, "step": 2290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.2265625, "completions/mean_terminated_length": 239.25555419921875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.0638839565217495, "epoch": 0.18328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.332954186279314e-06, "loss": 0.0, "num_tokens": 100182847.0, "reward": 0.02943696826696396, "reward_std": 0.0, "rewards/reward_fn/mean": 0.02943696826696396, "rewards/reward_fn/std": 0.07818891853094101, "step": 2291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06471531093120575, "epoch": 0.18336, "grad_norm": 0.0, "learning_rate": 3.332756730170112e-06, "loss": 0.0, "step": 2292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 217.5546875, "completions/mean_terminated_length": 194.4875030517578, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.0765981674194336, "epoch": 0.18344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.332559163284656e-06, "loss": 0.0, "num_tokens": 100276230.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07561139389872551, "epoch": 0.18352, "grad_norm": 0.0, "learning_rate": 3.3323614856367735e-06, "loss": 0.0, "step": 2294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.921875, "completions/mean_terminated_length": 238.12307739257812, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.07121680304408073, "epoch": 0.1836, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3321636972403e-06, "loss": 0.0, "num_tokens": 100373372.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07007302343845367, "epoch": 0.18368, "grad_norm": 0.0, "learning_rate": 3.331965798109078e-06, "loss": 0.0, "step": 2296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.6640625, "completions/mean_terminated_length": 219.186279296875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.06757111474871635, "epoch": 0.18376, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3317677882569594e-06, "loss": 0.0, "num_tokens": 100467921.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06191298924386501, "epoch": 0.18384, "grad_norm": 0.0, "learning_rate": 3.331569667697802e-06, "loss": 0.0, "step": 2298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5078125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.84375, "completions/mean_terminated_length": 239.4285888671875, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.07069163024425507, "epoch": 0.18392, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3313714364454725e-06, "loss": 0.0, "num_tokens": 100565181.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06991100311279297, "epoch": 0.184, "grad_norm": 0.0, "learning_rate": 3.331173094513845e-06, "loss": 0.0, "step": 2300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.6953125, "completions/mean_terminated_length": 241.83334350585938, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.07181146740913391, "epoch": 0.18408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.330974641916801e-06, "loss": 0.0, "num_tokens": 100662550.0, "reward": 0.40443697571754456, "reward_std": 0.0, "rewards/reward_fn/mean": 0.40443697571754456, "rewards/reward_fn/std": 0.9879210591316223, "step": 2301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07198012992739677, "epoch": 0.18416, "grad_norm": 0.0, "learning_rate": 3.33077607866823e-06, "loss": 0.0, "step": 2302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.3046875, "completions/mean_terminated_length": 230.22059631347656, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.07045949622988701, "epoch": 0.18424, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.33057740478203e-06, "loss": 0.0, "num_tokens": 100759101.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07029416412115097, "epoch": 0.18432, "grad_norm": 0.0, "learning_rate": 3.330378620272106e-06, "loss": 0.0, "step": 2304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 249.9375, "completions/mean_terminated_length": 239.13043212890625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.07727503776550293, "epoch": 0.1844, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.33017972515237e-06, "loss": 0.0, "num_tokens": 100856629.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07545911520719528, "epoch": 0.18448, "grad_norm": 0.0, "learning_rate": 3.329980719436744e-06, "loss": 0.0, "step": 2306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 222.6328125, "completions/mean_terminated_length": 206.3372039794922, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.06773201748728752, "epoch": 0.18456, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3297816031391545e-06, "loss": 0.0, "num_tokens": 100950662.0, "reward": 0.4327646493911743, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4327646493911743, "rewards/reward_fn/std": 0.985901951789856, "step": 2307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06416523829102516, "epoch": 0.18464, "grad_norm": 0.0, "learning_rate": 3.3295823762735386e-06, "loss": 0.0, "step": 2308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 190.8046875, "completions/mean_terminated_length": 177.27359008789062, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.06827422976493835, "epoch": 0.18472, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.32938303885384e-06, "loss": 0.0, "num_tokens": 101040621.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06785356625914574, "epoch": 0.1848, "grad_norm": 0.0, "learning_rate": 3.32918359089401e-06, "loss": 0.0, "step": 2310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 239.2265625, "completions/mean_terminated_length": 226.9864959716797, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.06897948309779167, "epoch": 0.18488, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.328984032408008e-06, "loss": 0.0, "num_tokens": 101136778.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06807850301265717, "epoch": 0.18496, "grad_norm": 0.0, "learning_rate": 3.328784363409801e-06, "loss": 0.0, "step": 2312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 223.671875, "completions/mean_terminated_length": 207.31765747070312, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.06421941891312599, "epoch": 0.18504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.328584583913364e-06, "loss": 0.0, "num_tokens": 101230944.0, "reward": 0.4020647704601288, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4020647704601288, "rewards/reward_fn/std": 0.9883498549461365, "step": 2313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06690340116620064, "epoch": 0.18512, "grad_norm": 0.0, "learning_rate": 3.3283846939326786e-06, "loss": 0.0, "step": 2314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 218.0625, "completions/mean_terminated_length": 203.78494262695312, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.08115360885858536, "epoch": 0.1852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3281846934817356e-06, "loss": 0.0, "num_tokens": 101324392.0, "reward": 0.4421311914920807, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4421311914920807, "rewards/reward_fn/std": 0.98649662733078, "step": 2315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08155960589647293, "epoch": 0.18528, "grad_norm": 0.0, "learning_rate": 3.3279845825745335e-06, "loss": 0.0, "step": 2316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 234.2109375, "completions/mean_terminated_length": 220.6962127685547, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.06160687282681465, "epoch": 0.18536, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3277843612250766e-06, "loss": 0.0, "num_tokens": 101419907.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06371621042490005, "epoch": 0.18544, "grad_norm": 0.0, "learning_rate": 3.327584029447379e-06, "loss": 0.0, "step": 2318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.7109375, "completions/mean_terminated_length": 222.34999084472656, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.06860340014100075, "epoch": 0.18552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.327383587255462e-06, "loss": 0.0, "num_tokens": 101514846.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07204844430088997, "epoch": 0.1856, "grad_norm": 0.0, "learning_rate": 3.327183034663354e-06, "loss": 0.0, "step": 2320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.8984375, "completions/mean_terminated_length": 233.6632537841797, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.07266992703080177, "epoch": 0.18568, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3269823716850923e-06, "loss": 0.0, "num_tokens": 101610961.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0712648406624794, "epoch": 0.18576, "grad_norm": 0.0, "learning_rate": 3.32678159833472e-06, "loss": 0.0, "step": 2322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.046875, "completions/mean_terminated_length": 230.4666748046875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.07292071729898453, "epoch": 0.18584, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.32658071462629e-06, "loss": 0.0, "num_tokens": 101706967.0, "reward": 0.4347124993801117, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4347124993801117, "rewards/reward_fn/std": 0.9859739542007446, "step": 2323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07112331688404083, "epoch": 0.18592, "grad_norm": 0.0, "learning_rate": 3.326379720573862e-06, "loss": 0.0, "step": 2324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.40625, "completions/mean_terminated_length": 219.7872314453125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.07087467610836029, "epoch": 0.186, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.326178616191503e-06, "loss": 0.0, "num_tokens": 101801867.0, "reward": 1.2398252487182617, "reward_std": 0.0, "rewards/reward_fn/mean": 1.2398252487182617, "rewards/reward_fn/std": 1.3995047807693481, "step": 2325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07294096425175667, "epoch": 0.18608, "grad_norm": 0.0, "learning_rate": 3.3259774014932888e-06, "loss": 0.0, "step": 2326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.875, "completions/mean_terminated_length": 239.63218688964844, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.06986230611801147, "epoch": 0.18616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3257760764933014e-06, "loss": 0.0, "num_tokens": 101898747.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06977679207921028, "epoch": 0.18624, "grad_norm": 0.0, "learning_rate": 3.3255746412056327e-06, "loss": 0.0, "step": 2328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.8515625, "completions/mean_terminated_length": 234.14285278320312, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.06487147510051727, "epoch": 0.18632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3253730956443798e-06, "loss": 0.0, "num_tokens": 101995368.0, "reward": 0.07393992692232132, "reward_std": 0.0, "rewards/reward_fn/mean": 0.07393992692232132, "rewards/reward_fn/std": 0.19639533758163452, "step": 2329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06625572592020035, "epoch": 0.1864, "grad_norm": 0.0, "learning_rate": 3.32517143982365e-06, "loss": 0.0, "step": 2330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 224.0, "completions/mean_terminated_length": 207.811767578125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.07001060247421265, "epoch": 0.18648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3249696737575553e-06, "loss": 0.0, "num_tokens": 102089576.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0699581615626812, "epoch": 0.18656, "grad_norm": 0.0, "learning_rate": 3.3247677974602187e-06, "loss": 0.0, "step": 2332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 228.734375, "completions/mean_terminated_length": 218.87232971191406, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.06258172355592251, "epoch": 0.18664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3245658109457692e-06, "loss": 0.0, "num_tokens": 102184390.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.062374627217650414, "epoch": 0.18672, "grad_norm": 0.0, "learning_rate": 3.3243637142283433e-06, "loss": 0.0, "step": 2334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.015625, "completions/mean_terminated_length": 222.27272033691406, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.06430651620030403, "epoch": 0.1868, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3241615073220857e-06, "loss": 0.0, "num_tokens": 102278984.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06603909283876419, "epoch": 0.18688, "grad_norm": 0.0, "learning_rate": 3.3239591902411493e-06, "loss": 0.0, "step": 2336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 216.859375, "completions/mean_terminated_length": 205.39393615722656, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.07016562297940254, "epoch": 0.18696, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3237567629996936e-06, "loss": 0.0, "num_tokens": 102372278.0, "reward": 0.04533843323588371, "reward_std": 0.0, "rewards/reward_fn/mean": 0.04533843323588371, "rewards/reward_fn/std": 0.12042555958032608, "step": 2337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07106206566095352, "epoch": 0.18704, "grad_norm": 0.0, "learning_rate": 3.323554225611886e-06, "loss": 0.0, "step": 2338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.078125, "completions/mean_terminated_length": 231.5890350341797, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.07491078972816467, "epoch": 0.18712, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.323351578091903e-06, "loss": 0.0, "num_tokens": 102468800.0, "reward": 0.0074910130351781845, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0074910130351781845, "rewards/reward_fn/std": 0.019897233694791794, "step": 2339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07775060087442398, "epoch": 0.1872, "grad_norm": 0.0, "learning_rate": 3.323148820453927e-06, "loss": 0.0, "step": 2340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.2109375, "completions/mean_terminated_length": 236.60345458984375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.06358731910586357, "epoch": 0.18728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.322945952712149e-06, "loss": 0.0, "num_tokens": 102565979.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06066029146313667, "epoch": 0.18736, "grad_norm": 0.0, "learning_rate": 3.322742974880769e-06, "loss": 0.0, "step": 2342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.2265625, "completions/mean_terminated_length": 204.5757598876953, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.07104703783988953, "epoch": 0.18744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3225398869739905e-06, "loss": 0.0, "num_tokens": 102659192.0, "reward": 0.4416208267211914, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4416208267211914, "rewards/reward_fn/std": 0.9839277863502502, "step": 2343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06956885382533073, "epoch": 0.18752, "grad_norm": 0.0, "learning_rate": 3.32233668900603e-06, "loss": 0.0, "step": 2344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.78125, "completions/mean_terminated_length": 236.9677276611328, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.06378146074712276, "epoch": 0.1876, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.322133380991108e-06, "loss": 0.0, "num_tokens": 102756316.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06176583655178547, "epoch": 0.18768, "grad_norm": 0.0, "learning_rate": 3.321929962943454e-06, "loss": 0.0, "step": 2346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 224.3828125, "completions/mean_terminated_length": 217.4571533203125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.0797700509428978, "epoch": 0.18776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3217264348773053e-06, "loss": 0.0, "num_tokens": 102850573.0, "reward": 0.40443697571754456, "reward_std": 0.0, "rewards/reward_fn/mean": 0.40443697571754456, "rewards/reward_fn/std": 0.9879209995269775, "step": 2347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07517682015895844, "epoch": 0.18784, "grad_norm": 0.0, "learning_rate": 3.321522796806907e-06, "loss": 0.0, "step": 2348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 210.5546875, "completions/mean_terminated_length": 198.40594482421875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.0716017335653305, "epoch": 0.18792, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3213190487465112e-06, "loss": 0.0, "num_tokens": 102943060.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07188631594181061, "epoch": 0.188, "grad_norm": 0.0, "learning_rate": 3.3211151907103787e-06, "loss": 0.0, "step": 2350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 243.453125, "completions/mean_terminated_length": 235.9250030517578, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.06477342918515205, "epoch": 0.18808, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.320911222712776e-06, "loss": 0.0, "num_tokens": 103039758.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06406110525131226, "epoch": 0.18816, "grad_norm": 0.0, "learning_rate": 3.3207071447679805e-06, "loss": 0.0, "step": 2352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 214.6328125, "completions/mean_terminated_length": 193.7058868408203, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.06564702466130257, "epoch": 0.18824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3205029568902748e-06, "loss": 0.0, "num_tokens": 103132767.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0703033097088337, "epoch": 0.18832, "grad_norm": 0.0, "learning_rate": 3.320298659093949e-06, "loss": 0.0, "step": 2354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.3515625, "completions/mean_terminated_length": 238.35714721679688, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.06231384351849556, "epoch": 0.1884, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3200942513933034e-06, "loss": 0.0, "num_tokens": 103229836.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06255125626921654, "epoch": 0.18848, "grad_norm": 0.0, "learning_rate": 3.3198897338026426e-06, "loss": 0.0, "step": 2356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.8359375, "completions/mean_terminated_length": 217.5154571533203, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.08052615821361542, "epoch": 0.18856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3196851063362824e-06, "loss": 0.0, "num_tokens": 103324407.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 2357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07708916440606117, "epoch": 0.18864, "grad_norm": 0.0, "learning_rate": 3.319480369008543e-06, "loss": 0.0, "step": 2358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.4609375, "completions/mean_terminated_length": 239.4394073486328, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.07507878541946411, "epoch": 0.18872, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.319275521833755e-06, "loss": 0.0, "num_tokens": 103421618.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07566315308213234, "epoch": 0.1888, "grad_norm": 0.0, "learning_rate": 3.3190705648262553e-06, "loss": 0.0, "step": 2360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.265625, "completions/mean_terminated_length": 238.69444274902344, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.06775955855846405, "epoch": 0.18888, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3188654980003885e-06, "loss": 0.0, "num_tokens": 103518676.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0690777488052845, "epoch": 0.18896, "grad_norm": 0.0, "learning_rate": 3.3186603213705075e-06, "loss": 0.0, "step": 2362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5546875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.765625, "completions/mean_terminated_length": 203.82456970214844, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.0632549300789833, "epoch": 0.18904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3184550349509717e-06, "loss": 0.0, "num_tokens": 103614006.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0663703978061676, "epoch": 0.18912, "grad_norm": 0.0, "learning_rate": 3.31824963875615e-06, "loss": 0.0, "step": 2364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 237.53125, "completions/mean_terminated_length": 227.1707305908203, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.06660188734531403, "epoch": 0.1892, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3180441328004163e-06, "loss": 0.0, "num_tokens": 103709946.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06739116460084915, "epoch": 0.18928, "grad_norm": 0.0, "learning_rate": 3.317838517098156e-06, "loss": 0.0, "step": 2366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 208.015625, "completions/mean_terminated_length": 186.2045440673828, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.06363023072481155, "epoch": 0.18936, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3176327916637586e-06, "loss": 0.0, "num_tokens": 103802108.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06564657017588615, "epoch": 0.18944, "grad_norm": 0.0, "learning_rate": 3.3174269565116233e-06, "loss": 0.0, "step": 2368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 246.8359375, "completions/mean_terminated_length": 237.671875, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.07301706820726395, "epoch": 0.18952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.317221011656156e-06, "loss": 0.0, "num_tokens": 103899239.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0747075192630291, "epoch": 0.1896, "grad_norm": 0.0, "learning_rate": 3.3170149571117716e-06, "loss": 0.0, "step": 2370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 250.3046875, "completions/mean_terminated_length": 240.48934936523438, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.06983295828104019, "epoch": 0.18968, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.31680879289289e-06, "loss": 0.0, "num_tokens": 103996814.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0721406489610672, "epoch": 0.18976, "grad_norm": 0.0, "learning_rate": 3.3166025190139422e-06, "loss": 0.0, "step": 2372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.828125, "completions/mean_terminated_length": 205.8148193359375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.07335049659013748, "epoch": 0.18984, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.316396135489364e-06, "loss": 0.0, "num_tokens": 104092408.0, "reward": 0.7673865556716919, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7673865556716919, "rewards/reward_fn/std": 1.2948493957519531, "step": 2373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07515867799520493, "epoch": 0.18992, "grad_norm": 0.0, "learning_rate": 3.3161896423336006e-06, "loss": 0.0, "step": 2374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.2421875, "completions/mean_terminated_length": 237.6516876220703, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.07050692662596703, "epoch": 0.19, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3159830395611048e-06, "loss": 0.0, "num_tokens": 104189079.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.071758221834898, "epoch": 0.19008, "grad_norm": 0.0, "learning_rate": 3.315776327186336e-06, "loss": 0.0, "step": 2376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 241.8671875, "completions/mean_terminated_length": 235.4431915283203, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.06721116602420807, "epoch": 0.19016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.315569505223762e-06, "loss": 0.0, "num_tokens": 104285574.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0663900189101696, "epoch": 0.19024, "grad_norm": 0.0, "learning_rate": 3.315362573687858e-06, "loss": 0.0, "step": 2378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 249.0234375, "completions/mean_terminated_length": 240.86441040039062, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.06688183173537254, "epoch": 0.19032, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3151555325931074e-06, "loss": 0.0, "num_tokens": 104382985.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06494142860174179, "epoch": 0.1904, "grad_norm": 0.0, "learning_rate": 3.3149483819540004e-06, "loss": 0.0, "step": 2380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.2265625, "completions/mean_terminated_length": 206.89334106445312, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.07456713914871216, "epoch": 0.19048, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.314741121785036e-06, "loss": 0.0, "num_tokens": 104477606.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07447347417473793, "epoch": 0.19056, "grad_norm": 0.0, "learning_rate": 3.3145337521007197e-06, "loss": 0.0, "step": 2382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 225.3203125, "completions/mean_terminated_length": 208.1097412109375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.07203186675906181, "epoch": 0.19064, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3143262729155653e-06, "loss": 0.0, "num_tokens": 104571983.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 2383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07128269225358963, "epoch": 0.19072, "grad_norm": 0.0, "learning_rate": 3.3141186842440943e-06, "loss": 0.0, "step": 2384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.859375, "completions/mean_terminated_length": 237.51649475097656, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.07155065983533859, "epoch": 0.1908, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.313910986100836e-06, "loss": 0.0, "num_tokens": 104668605.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.072622399777174, "epoch": 0.19088, "grad_norm": 0.0, "learning_rate": 3.3137031785003254e-06, "loss": 0.0, "step": 2386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 212.84375, "completions/mean_terminated_length": 191.0117645263672, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.07263830304145813, "epoch": 0.19096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3134952614571096e-06, "loss": 0.0, "num_tokens": 104761385.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07252204790711403, "epoch": 0.19104, "grad_norm": 0.0, "learning_rate": 3.3132872349857386e-06, "loss": 0.0, "step": 2388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.5078125, "completions/mean_terminated_length": 209.38571166992188, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.06874676793813705, "epoch": 0.19112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3130790991007725e-06, "loss": 0.0, "num_tokens": 104856426.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06338484212756157, "epoch": 0.1912, "grad_norm": 0.0, "learning_rate": 3.3128708538167787e-06, "loss": 0.0, "step": 2390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1328125, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 192.546875, "completions/mean_terminated_length": 182.82882690429688, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.0665342677384615, "epoch": 0.19128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.312662499148332e-06, "loss": 0.0, "num_tokens": 104946608.0, "reward": 0.625, "reward_std": 0.0, "rewards/reward_fn/mean": 0.625, "rewards/reward_fn/std": 1.1153898239135742, "step": 2391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06340726092457771, "epoch": 0.19136, "grad_norm": 0.0, "learning_rate": 3.3124540351100155e-06, "loss": 0.0, "step": 2392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.4921875, "completions/mean_terminated_length": 218.9368438720703, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.07097659632563591, "epoch": 0.19144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.312245461716419e-06, "loss": 0.0, "num_tokens": 105041391.0, "reward": 0.7953384518623352, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7953384518623352, "rewards/reward_fn/std": 1.28325617313385, "step": 2393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07298946753144264, "epoch": 0.19152, "grad_norm": 0.0, "learning_rate": 3.3120367789821402e-06, "loss": 0.0, "step": 2394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 238.4140625, "completions/mean_terminated_length": 232.0531768798828, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.07182908430695534, "epoch": 0.1916, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.311827986921786e-06, "loss": 0.0, "num_tokens": 105137444.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07175513356924057, "epoch": 0.19168, "grad_norm": 0.0, "learning_rate": 3.311619085549968e-06, "loss": 0.0, "step": 2396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 222.6640625, "completions/mean_terminated_length": 205.20237731933594, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.0607123076915741, "epoch": 0.19176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3114100748813086e-06, "loss": 0.0, "num_tokens": 105231481.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.060876697301864624, "epoch": 0.19184, "grad_norm": 0.0, "learning_rate": 3.311200954930435e-06, "loss": 0.0, "step": 2398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 245.3515625, "completions/mean_terminated_length": 237.32876586914062, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.07476688176393509, "epoch": 0.19192, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.310991725711984e-06, "loss": 0.0, "num_tokens": 105328422.0, "reward": 0.10638301074504852, "reward_std": 0.0, "rewards/reward_fn/mean": 0.10638301074504852, "rewards/reward_fn/std": 0.28256893157958984, "step": 2399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07511190697550774, "epoch": 0.192, "grad_norm": 0.0, "learning_rate": 3.3107823872405996e-06, "loss": 0.0, "step": 2400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.4375, "completions/mean_terminated_length": 239.89474487304688, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.07535763829946518, "epoch": 0.19208, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3105729395309324e-06, "loss": 0.0, "num_tokens": 105425502.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07845424115657806, "epoch": 0.19216, "grad_norm": 0.0, "learning_rate": 3.3103633825976428e-06, "loss": 0.0, "step": 2402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 198.7578125, "completions/mean_terminated_length": 189.39089965820312, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.06926099956035614, "epoch": 0.19224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.310153716455397e-06, "loss": 0.0, "num_tokens": 105516479.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06957071274518967, "epoch": 0.19232, "grad_norm": 0.0, "learning_rate": 3.309943941118869e-06, "loss": 0.0, "step": 2404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 217.296875, "completions/mean_terminated_length": 193.2911376953125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06924666464328766, "epoch": 0.1924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3097340566027416e-06, "loss": 0.0, "num_tokens": 105609829.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06977450847625732, "epoch": 0.19248, "grad_norm": 0.0, "learning_rate": 3.309524062921704e-06, "loss": 0.0, "step": 2406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.6640625, "completions/mean_terminated_length": 209.0, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.06525248289108276, "epoch": 0.19256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.309313960090454e-06, "loss": 0.0, "num_tokens": 105704890.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06641180068254471, "epoch": 0.19264, "grad_norm": 0.0, "learning_rate": 3.309103748123696e-06, "loss": 0.0, "step": 2408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 215.328125, "completions/mean_terminated_length": 202.87754821777344, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.06593411788344383, "epoch": 0.19272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3088934270361427e-06, "loss": 0.0, "num_tokens": 105797988.0, "reward": 0.02943696826696396, "reward_std": 0.0, "rewards/reward_fn/mean": 0.02943696826696396, "rewards/reward_fn/std": 0.07818891853094101, "step": 2409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06989593431353569, "epoch": 0.1928, "grad_norm": 0.0, "learning_rate": 3.3086829968425148e-06, "loss": 0.0, "step": 2410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.9296875, "completions/mean_terminated_length": 227.44444274902344, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.07317698001861572, "epoch": 0.19288, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3084724575575396e-06, "loss": 0.0, "num_tokens": 105893979.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07455718889832497, "epoch": 0.19296, "grad_norm": 0.0, "learning_rate": 3.3082618091959535e-06, "loss": 0.0, "step": 2412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.6953125, "completions/mean_terminated_length": 206.92425537109375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.06241528503596783, "epoch": 0.19304, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.308051051772499e-06, "loss": 0.0, "num_tokens": 105989044.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.060893140733242035, "epoch": 0.19312, "grad_norm": 0.0, "learning_rate": 3.307840185301927e-06, "loss": 0.0, "step": 2414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 239.5234375, "completions/mean_terminated_length": 232.0341033935547, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.06484416872262955, "epoch": 0.1932, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.307629209798995e-06, "loss": 0.0, "num_tokens": 106085239.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06812063604593277, "epoch": 0.19328, "grad_norm": 0.0, "learning_rate": 3.307418125278471e-06, "loss": 0.0, "step": 2416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 227.9453125, "completions/mean_terminated_length": 203.19117736816406, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.06980730220675468, "epoch": 0.19336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.307206931755127e-06, "loss": 0.0, "num_tokens": 106179952.0, "reward": 0.49996522068977356, "reward_std": 0.0, "rewards/reward_fn/mean": 0.49996522068977356, "rewards/reward_fn/std": 1.0039118528366089, "step": 2417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07378926873207092, "epoch": 0.19344, "grad_norm": 0.0, "learning_rate": 3.3069956292437456e-06, "loss": 0.0, "step": 2418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 231.078125, "completions/mean_terminated_length": 220.1573028564453, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.06680677458643913, "epoch": 0.19352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.306784217759115e-06, "loss": 0.0, "num_tokens": 106275066.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06717430800199509, "epoch": 0.1936, "grad_norm": 0.0, "learning_rate": 3.3065726973160316e-06, "loss": 0.0, "step": 2420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.6953125, "completions/mean_terminated_length": 213.37648010253906, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.06907090917229652, "epoch": 0.19368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3063610679293003e-06, "loss": 0.0, "num_tokens": 106369747.0, "reward": 0.0962333157658577, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0962333157658577, "rewards/reward_fn/std": 0.2556098699569702, "step": 2421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06861745193600655, "epoch": 0.19376, "grad_norm": 0.0, "learning_rate": 3.3061493296137322e-06, "loss": 0.0, "step": 2422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.7734375, "completions/mean_terminated_length": 236.8400115966797, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.06988874822854996, "epoch": 0.19384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3059374823841474e-06, "loss": 0.0, "num_tokens": 106466614.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07102864980697632, "epoch": 0.19392, "grad_norm": 0.0, "learning_rate": 3.3057255262553724e-06, "loss": 0.0, "step": 2424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 226.4921875, "completions/mean_terminated_length": 221.34861755371094, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.06771031394600868, "epoch": 0.194, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.305513461242242e-06, "loss": 0.0, "num_tokens": 106561141.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06998111680150032, "epoch": 0.19408, "grad_norm": 0.0, "learning_rate": 3.3053012873595986e-06, "loss": 0.0, "step": 2426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 220.8828125, "completions/mean_terminated_length": 210.1326446533203, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.07784581929445267, "epoch": 0.19416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.305089004622293e-06, "loss": 0.0, "num_tokens": 106654950.0, "reward": 0.4246163070201874, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4246163070201874, "rewards/reward_fn/std": 0.9858949184417725, "step": 2427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07983610779047012, "epoch": 0.19424, "grad_norm": 0.0, "learning_rate": 3.304876613045181e-06, "loss": 0.0, "step": 2428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.6484375, "completions/mean_terminated_length": 217.88394165039062, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.06261102668941021, "epoch": 0.19432, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3046641126431294e-06, "loss": 0.0, "num_tokens": 106748985.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06025497615337372, "epoch": 0.1944, "grad_norm": 0.0, "learning_rate": 3.3044515034310096e-06, "loss": 0.0, "step": 2430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.8828125, "completions/mean_terminated_length": 204.01190185546875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.06797553598880768, "epoch": 0.19448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.304238785423703e-06, "loss": 0.0, "num_tokens": 106842922.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06919921934604645, "epoch": 0.19456, "grad_norm": 0.0, "learning_rate": 3.304025958636097e-06, "loss": 0.0, "step": 2432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 224.984375, "completions/mean_terminated_length": 206.375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.06975805386900902, "epoch": 0.19464, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.303813023083088e-06, "loss": 0.0, "num_tokens": 106937256.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06809673458337784, "epoch": 0.19472, "grad_norm": 0.0, "learning_rate": 3.3035999787795788e-06, "loss": 0.0, "step": 2434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.3828125, "completions/mean_terminated_length": 208.01266479492188, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.060903456062078476, "epoch": 0.1948, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.303386825740479e-06, "loss": 0.0, "num_tokens": 107031769.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06797906011343002, "epoch": 0.19488, "grad_norm": 0.0, "learning_rate": 3.30317356398071e-06, "loss": 0.0, "step": 2436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.0078125, "completions/mean_terminated_length": 216.63076782226562, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.06275503896176815, "epoch": 0.19496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.302960193515195e-06, "loss": 0.0, "num_tokens": 107127514.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06462889909744263, "epoch": 0.19504, "grad_norm": 0.0, "learning_rate": 3.302746714358869e-06, "loss": 0.0, "step": 2438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 227.3046875, "completions/mean_terminated_length": 216.0760955810547, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.06836968660354614, "epoch": 0.19512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.302533126526673e-06, "loss": 0.0, "num_tokens": 107222145.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07171107828617096, "epoch": 0.1952, "grad_norm": 0.0, "learning_rate": 3.3023194300335566e-06, "loss": 0.0, "step": 2440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.5703125, "completions/mean_terminated_length": 198.64773559570312, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.07078546285629272, "epoch": 0.19528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.302105624894475e-06, "loss": 0.0, "num_tokens": 107315402.0, "reward": 0.7574909925460815, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7574909925460815, "rewards/reward_fn/std": 1.29994535446167, "step": 2441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07130282744765282, "epoch": 0.19536, "grad_norm": 0.0, "learning_rate": 3.3018917111243933e-06, "loss": 0.0, "step": 2442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.25, "completions/mean_terminated_length": 222.40000915527344, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.06725245341658592, "epoch": 0.19544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.301677688738283e-06, "loss": 0.0, "num_tokens": 107411690.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0723147988319397, "epoch": 0.19552, "grad_norm": 0.0, "learning_rate": 3.301463557751123e-06, "loss": 0.0, "step": 2444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.75, "completions/mean_terminated_length": 238.5066680908203, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.06183089688420296, "epoch": 0.1956, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.3012493181779004e-06, "loss": 0.0, "num_tokens": 107508682.0, "reward": 0.004997334908694029, "reward_std": 0.0, "rewards/reward_fn/mean": 0.004997334908694029, "rewards/reward_fn/std": 0.013273656368255615, "step": 2445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06347677111625671, "epoch": 0.19568, "grad_norm": 0.0, "learning_rate": 3.3010349700336105e-06, "loss": 0.0, "step": 2446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.9296875, "completions/mean_terminated_length": 238.58108520507812, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.07135406136512756, "epoch": 0.19576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.300820513333254e-06, "loss": 0.0, "num_tokens": 107605697.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.073697280138731, "epoch": 0.19584, "grad_norm": 0.0, "learning_rate": 3.300605948091842e-06, "loss": 0.0, "step": 2448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 235.125, "completions/mean_terminated_length": 222.17721557617188, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.06344162300229073, "epoch": 0.19592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.300391274324391e-06, "loss": 0.0, "num_tokens": 107701329.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06296905502676964, "epoch": 0.196, "grad_norm": 0.0, "learning_rate": 3.3001764920459253e-06, "loss": 0.0, "step": 2450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.7109375, "completions/mean_terminated_length": 235.5, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.06856486946344376, "epoch": 0.19608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2999616012714784e-06, "loss": 0.0, "num_tokens": 107798444.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06862469762563705, "epoch": 0.19616, "grad_norm": 0.0, "learning_rate": 3.2997466020160904e-06, "loss": 0.0, "step": 2452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 209.5546875, "completions/mean_terminated_length": 196.5500030517578, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.06738241016864777, "epoch": 0.19624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.299531494294809e-06, "loss": 0.0, "num_tokens": 107890803.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06646697968244553, "epoch": 0.19632, "grad_norm": 0.0, "learning_rate": 3.2993162781226884e-06, "loss": 0.0, "step": 2454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5859375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.8125, "completions/mean_terminated_length": 238.64151000976562, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.06935854256153107, "epoch": 0.1964, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.299100953514793e-06, "loss": 0.0, "num_tokens": 107988187.0, "reward": 0.3972600996494293, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3972600996494293, "rewards/reward_fn/std": 0.9893408417701721, "step": 2455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06738533079624176, "epoch": 0.19648, "grad_norm": 0.0, "learning_rate": 3.2988855204861917e-06, "loss": 0.0, "step": 2456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.703125, "completions/mean_terminated_length": 221.1951141357422, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.07501719892024994, "epoch": 0.19656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2986699790519633e-06, "loss": 0.0, "num_tokens": 108083637.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07681635022163391, "epoch": 0.19664, "grad_norm": 0.0, "learning_rate": 3.298454329227194e-06, "loss": 0.0, "step": 2458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.46875, "completions/mean_terminated_length": 235.45631408691406, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.06815189495682716, "epoch": 0.19672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.298238571026976e-06, "loss": 0.0, "num_tokens": 108179825.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06803971156477928, "epoch": 0.1968, "grad_norm": 0.0, "learning_rate": 3.298022704466411e-06, "loss": 0.0, "step": 2460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 217.125, "completions/mean_terminated_length": 195.3170623779297, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.06880791112780571, "epoch": 0.19688, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2978067295606064e-06, "loss": 0.0, "num_tokens": 108273153.0, "reward": 0.3799973428249359, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3799973428249359, "rewards/reward_fn/std": 0.9942457675933838, "step": 2461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06583361327648163, "epoch": 0.19696, "grad_norm": 0.0, "learning_rate": 3.297590646324679e-06, "loss": 0.0, "step": 2462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 197.0703125, "completions/mean_terminated_length": 181.31683349609375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.07122499868273735, "epoch": 0.19704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.297374454773751e-06, "loss": 0.0, "num_tokens": 108363914.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07006790116429329, "epoch": 0.19712, "grad_norm": 0.0, "learning_rate": 3.297158154922956e-06, "loss": 0.0, "step": 2464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.46875, "completions/mean_terminated_length": 235.16883850097656, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.07124955207109451, "epoch": 0.1972, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2969417467874304e-06, "loss": 0.0, "num_tokens": 108460614.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07267681509256363, "epoch": 0.19728, "grad_norm": 0.0, "learning_rate": 3.2967252303823214e-06, "loss": 0.0, "step": 2466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.1640625, "completions/mean_terminated_length": 231.78651428222656, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.0678563266992569, "epoch": 0.19736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.296508605722783e-06, "loss": 0.0, "num_tokens": 108556763.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06857882812619209, "epoch": 0.19744, "grad_norm": 0.0, "learning_rate": 3.296291872823976e-06, "loss": 0.0, "step": 2468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 218.4140625, "completions/mean_terminated_length": 203.70652770996094, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.07402875274419785, "epoch": 0.19752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.29607503170107e-06, "loss": 0.0, "num_tokens": 108650256.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07353029772639275, "epoch": 0.1976, "grad_norm": 0.0, "learning_rate": 3.2958580823692414e-06, "loss": 0.0, "step": 2470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.2421875, "completions/mean_terminated_length": 232.2027130126953, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.07375185564160347, "epoch": 0.19768, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2956410248436745e-06, "loss": 0.0, "num_tokens": 108746799.0, "reward": 0.12208539247512817, "reward_std": 0.0, "rewards/reward_fn/mean": 0.12208539247512817, "rewards/reward_fn/std": 0.32427677512168884, "step": 2471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07213269174098969, "epoch": 0.19776, "grad_norm": 0.0, "learning_rate": 3.295423859139561e-06, "loss": 0.0, "step": 2472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.65625, "completions/mean_terminated_length": 232.494384765625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.0627228170633316, "epoch": 0.19784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2952065852721e-06, "loss": 0.0, "num_tokens": 108843011.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05940704233944416, "epoch": 0.19792, "grad_norm": 0.0, "learning_rate": 3.2949892032564986e-06, "loss": 0.0, "step": 2474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 237.3046875, "completions/mean_terminated_length": 223.21917724609375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.0748186819255352, "epoch": 0.198, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2947717131079713e-06, "loss": 0.0, "num_tokens": 108938922.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07716908305883408, "epoch": 0.19808, "grad_norm": 0.0, "learning_rate": 3.2945541148417394e-06, "loss": 0.0, "step": 2476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.4765625, "completions/mean_terminated_length": 240.15293884277344, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.06482961401343346, "epoch": 0.19816, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.294336408473034e-06, "loss": 0.0, "num_tokens": 109035879.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06445565819740295, "epoch": 0.19824, "grad_norm": 0.0, "learning_rate": 3.2941185940170904e-06, "loss": 0.0, "step": 2478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 232.0234375, "completions/mean_terminated_length": 222.2747344970703, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.07925298064947128, "epoch": 0.19832, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2939006714891542e-06, "loss": 0.0, "num_tokens": 109131114.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07853778079152107, "epoch": 0.1984, "grad_norm": 0.0, "learning_rate": 3.293682640904478e-06, "loss": 0.0, "step": 2480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.296875, "completions/mean_terminated_length": 234.36781311035156, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.06966306269168854, "epoch": 0.19848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.293464502278321e-06, "loss": 0.0, "num_tokens": 109227536.0, "reward": 0.384978711605072, "reward_std": 0.0, "rewards/reward_fn/mean": 0.384978711605072, "rewards/reward_fn/std": 0.9926154613494873, "step": 2481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06984518840909004, "epoch": 0.19856, "grad_norm": 0.0, "learning_rate": 3.2932462556259513e-06, "loss": 0.0, "step": 2482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.9453125, "completions/mean_terminated_length": 206.22352600097656, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.07902586460113525, "epoch": 0.19864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.293027900962643e-06, "loss": 0.0, "num_tokens": 109321609.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0810917466878891, "epoch": 0.19872, "grad_norm": 0.0, "learning_rate": 3.2928094383036797e-06, "loss": 0.0, "step": 2484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.4609375, "completions/mean_terminated_length": 227.9859161376953, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.0720767043530941, "epoch": 0.1988, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2925908676643504e-06, "loss": 0.0, "num_tokens": 109417924.0, "reward": 0.37749966979026794, "reward_std": 0.0, "rewards/reward_fn/mean": 0.37749966979026794, "rewards/reward_fn/std": 0.9951284527778625, "step": 2485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07269517704844475, "epoch": 0.19888, "grad_norm": 0.0, "learning_rate": 3.2923721890599532e-06, "loss": 0.0, "step": 2486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 224.015625, "completions/mean_terminated_length": 215.05999755859375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.0703001543879509, "epoch": 0.19896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2921534025057936e-06, "loss": 0.0, "num_tokens": 109512134.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07107341289520264, "epoch": 0.19904, "grad_norm": 0.0, "learning_rate": 3.291934508017184e-06, "loss": 0.0, "step": 2488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 233.1875, "completions/mean_terminated_length": 208.90321350097656, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.06461343541741371, "epoch": 0.19912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2917155056094444e-06, "loss": 0.0, "num_tokens": 109607518.0, "reward": 0.004997334908694029, "reward_std": 0.0, "rewards/reward_fn/mean": 0.004997334908694029, "rewards/reward_fn/std": 0.013273656368255615, "step": 2489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06237632781267166, "epoch": 0.1992, "grad_norm": 0.0, "learning_rate": 3.2914963952979035e-06, "loss": 0.0, "step": 2490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 220.8125, "completions/mean_terminated_length": 206.5054931640625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.06691541522741318, "epoch": 0.19928, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.291277177097896e-06, "loss": 0.0, "num_tokens": 109701318.0, "reward": 0.4633024334907532, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4633024334907532, "rewards/reward_fn/std": 0.9901458024978638, "step": 2491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0701085664331913, "epoch": 0.19936, "grad_norm": 0.0, "learning_rate": 3.291057851024765e-06, "loss": 0.0, "step": 2492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.0546875, "completions/mean_terminated_length": 232.08750915527344, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.06375176087021828, "epoch": 0.19944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2908384170938612e-06, "loss": 0.0, "num_tokens": 109797709.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06523089855909348, "epoch": 0.19952, "grad_norm": 0.0, "learning_rate": 3.290618875320542e-06, "loss": 0.0, "step": 2494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.9453125, "completions/mean_terminated_length": 241.20689392089844, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.06747514754533768, "epoch": 0.1996, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.290399225720174e-06, "loss": 0.0, "num_tokens": 109894726.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07052789628505707, "epoch": 0.19968, "grad_norm": 0.0, "learning_rate": 3.29017946830813e-06, "loss": 0.0, "step": 2496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.6953125, "completions/mean_terminated_length": 201.31521606445312, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.0673823393881321, "epoch": 0.19976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.28995960309979e-06, "loss": 0.0, "num_tokens": 109987999.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06604842096567154, "epoch": 0.19984, "grad_norm": 0.0, "learning_rate": 3.2897396301105433e-06, "loss": 0.0, "step": 2498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.875, "completions/mean_terminated_length": 192.71641540527344, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06772205606102943, "epoch": 0.19992, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2895195493557853e-06, "loss": 0.0, "num_tokens": 110082063.0, "reward": 0.5031033754348755, "reward_std": 0.0, "rewards/reward_fn/mean": 0.5031033754348755, "rewards/reward_fn/std": 0.9848445057868958, "step": 2499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06611407175660133, "epoch": 0.2, "grad_norm": 0.0, "learning_rate": 3.2892993608509194e-06, "loss": 0.0, "step": 2500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.7890625, "completions/mean_terminated_length": 235.89230346679688, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.07256744056940079, "epoch": 0.20008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2890790646113556e-06, "loss": 0.0, "num_tokens": 110179060.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07274351269006729, "epoch": 0.20016, "grad_norm": 0.0, "learning_rate": 3.2888586606525135e-06, "loss": 0.0, "step": 2502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 246.0703125, "completions/mean_terminated_length": 233.3035888671875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.06945213675498962, "epoch": 0.20024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2886381489898186e-06, "loss": 0.0, "num_tokens": 110276093.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06829946860671043, "epoch": 0.20032, "grad_norm": 0.0, "learning_rate": 3.2884175296387044e-06, "loss": 0.0, "step": 2504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 232.40625, "completions/mean_terminated_length": 210.9253692626953, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.06726205348968506, "epoch": 0.2004, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.288196802614612e-06, "loss": 0.0, "num_tokens": 110371377.0, "reward": 0.857715368270874, "reward_std": 0.0, "rewards/reward_fn/mean": 0.857715368270874, "rewards/reward_fn/std": 1.272713541984558, "step": 2505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06999707221984863, "epoch": 0.20048, "grad_norm": 0.0, "learning_rate": 3.2879759679329893e-06, "loss": 0.0, "step": 2506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.078125, "completions/mean_terminated_length": 218.5, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.07133474573493004, "epoch": 0.20056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.287755025609293e-06, "loss": 0.0, "num_tokens": 110467131.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07154396176338196, "epoch": 0.20064, "grad_norm": 0.0, "learning_rate": 3.2875339756589874e-06, "loss": 0.0, "step": 2508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.15625, "completions/mean_terminated_length": 217.69862365722656, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.07537145167589188, "epoch": 0.20072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2873128180975426e-06, "loss": 0.0, "num_tokens": 110562639.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07081526517868042, "epoch": 0.2008, "grad_norm": 0.0, "learning_rate": 3.287091552940437e-06, "loss": 0.0, "step": 2510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 224.8359375, "completions/mean_terminated_length": 218.36793518066406, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.0649118423461914, "epoch": 0.20088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2868701802031584e-06, "loss": 0.0, "num_tokens": 110656954.0, "reward": 0.019831063225865364, "reward_std": 0.0, "rewards/reward_fn/mean": 0.019831063225865364, "rewards/reward_fn/std": 0.052674222737550735, "step": 2511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06611841544508934, "epoch": 0.20096, "grad_norm": 0.0, "learning_rate": 3.2866486999011988e-06, "loss": 0.0, "step": 2512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 210.5234375, "completions/mean_terminated_length": 189.85227966308594, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.06708624586462975, "epoch": 0.20104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.286427112050061e-06, "loss": 0.0, "num_tokens": 110749437.0, "reward": 0.759978711605072, "reward_std": 0.0, "rewards/reward_fn/mean": 0.759978711605072, "rewards/reward_fn/std": 1.2986160516738892, "step": 2513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06837008520960808, "epoch": 0.20112, "grad_norm": 0.0, "learning_rate": 3.286205416665252e-06, "loss": 0.0, "step": 2514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.84375, "completions/mean_terminated_length": 237.6981201171875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.06463025510311127, "epoch": 0.2012, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2859836137622904e-06, "loss": 0.0, "num_tokens": 110845801.0, "reward": 0.11692613363265991, "reward_std": 0.0, "rewards/reward_fn/mean": 0.11692613363265991, "rewards/reward_fn/std": 0.310573011636734, "step": 2515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06349900737404823, "epoch": 0.20128, "grad_norm": 0.0, "learning_rate": 3.2857617033566983e-06, "loss": 0.0, "step": 2516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.140625, "completions/mean_terminated_length": 206.88571166992188, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.06428119167685509, "epoch": 0.20136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.285539685464008e-06, "loss": 0.0, "num_tokens": 110940667.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06542780250310898, "epoch": 0.20144, "grad_norm": 0.0, "learning_rate": 3.2853175600997583e-06, "loss": 0.0, "step": 2518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 241.2890625, "completions/mean_terminated_length": 227.89552307128906, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.07601885497570038, "epoch": 0.20152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2850953272794944e-06, "loss": 0.0, "num_tokens": 111037088.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07499367371201515, "epoch": 0.2016, "grad_norm": 0.0, "learning_rate": 3.284872987018772e-06, "loss": 0.0, "step": 2520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 208.0390625, "completions/mean_terminated_length": 182.9166717529297, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.07520382478833199, "epoch": 0.20168, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2846505393331515e-06, "loss": 0.0, "num_tokens": 111129253.0, "reward": 0.4741498827934265, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4741498827934265, "rewards/reward_fn/std": 0.9932445883750916, "step": 2521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07578258588910103, "epoch": 0.20176, "grad_norm": 0.0, "learning_rate": 3.2844279842382027e-06, "loss": 0.0, "step": 2522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 230.3984375, "completions/mean_terminated_length": 217.4470672607422, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.07255223393440247, "epoch": 0.20184, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.284205321749501e-06, "loss": 0.0, "num_tokens": 111224280.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07259098440408707, "epoch": 0.20192, "grad_norm": 0.0, "learning_rate": 3.2839825518826318e-06, "loss": 0.0, "step": 2524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 219.7265625, "completions/mean_terminated_length": 186.7014923095703, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.07659575343132019, "epoch": 0.202, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2837596746531855e-06, "loss": 0.0, "num_tokens": 111317941.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07723431289196014, "epoch": 0.20208, "grad_norm": 0.0, "learning_rate": 3.283536690076761e-06, "loss": 0.0, "step": 2526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.71875, "completions/mean_terminated_length": 204.65753173828125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.07215230911970139, "epoch": 0.20216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.283313598168966e-06, "loss": 0.0, "num_tokens": 111412497.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07813703268766403, "epoch": 0.20224, "grad_norm": 0.0, "learning_rate": 3.283090398945414e-06, "loss": 0.0, "step": 2528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.1796875, "completions/mean_terminated_length": 237.35227966308594, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.06681952625513077, "epoch": 0.20232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2828670924217267e-06, "loss": 0.0, "num_tokens": 111509160.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06723781675100327, "epoch": 0.2024, "grad_norm": 0.0, "learning_rate": 3.2826436786135326e-06, "loss": 0.0, "step": 2530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.078125, "completions/mean_terminated_length": 242.10958862304688, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.06482930481433868, "epoch": 0.20248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.282420157536469e-06, "loss": 0.0, "num_tokens": 111606450.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06556207686662674, "epoch": 0.20256, "grad_norm": 0.0, "learning_rate": 3.2821965292061798e-06, "loss": 0.0, "step": 2532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.09375, "completions/mean_terminated_length": 236.1875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.06946680322289467, "epoch": 0.20264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.281972793638317e-06, "loss": 0.0, "num_tokens": 111703486.0, "reward": 0.27467191219329834, "reward_std": 0.0, "rewards/reward_fn/mean": 0.27467191219329834, "rewards/reward_fn/std": 0.6578801274299622, "step": 2533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07146774977445602, "epoch": 0.20272, "grad_norm": 0.0, "learning_rate": 3.2817489508485383e-06, "loss": 0.0, "step": 2534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 215.671875, "completions/mean_terminated_length": 204.89108276367188, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.0733405351638794, "epoch": 0.2028, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.281525000852512e-06, "loss": 0.0, "num_tokens": 111796628.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07087389752268791, "epoch": 0.20288, "grad_norm": 0.0, "learning_rate": 3.281300943665912e-06, "loss": 0.0, "step": 2536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.6171875, "completions/mean_terminated_length": 222.11712646484375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.06638815999031067, "epoch": 0.20296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.281076779304419e-06, "loss": 0.0, "num_tokens": 111891171.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07214123010635376, "epoch": 0.20304, "grad_norm": 0.0, "learning_rate": 3.280852507783723e-06, "loss": 0.0, "step": 2538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.734375, "completions/mean_terminated_length": 201.9569854736328, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.07042960077524185, "epoch": 0.20312, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.28062812911952e-06, "loss": 0.0, "num_tokens": 111984449.0, "reward": 0.3799973428249359, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3799973428249359, "rewards/reward_fn/std": 0.9942457675933838, "step": 2539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06949379295110703, "epoch": 0.2032, "grad_norm": 0.0, "learning_rate": 3.2804036433275143e-06, "loss": 0.0, "step": 2540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.5, "completions/mean_terminated_length": 217.1914825439453, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.07233266904950142, "epoch": 0.20328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.280179050423418e-06, "loss": 0.0, "num_tokens": 112079105.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0714748278260231, "epoch": 0.20336, "grad_norm": 0.0, "learning_rate": 3.27995435042295e-06, "loss": 0.0, "step": 2542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.8125, "completions/mean_terminated_length": 229.09091186523438, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.0668560341000557, "epoch": 0.20344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.279729543341837e-06, "loss": 0.0, "num_tokens": 112175337.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06516899541020393, "epoch": 0.20352, "grad_norm": 0.0, "learning_rate": 3.2795046291958127e-06, "loss": 0.0, "step": 2544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.6796875, "completions/mean_terminated_length": 229.94444274902344, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.06607882678508759, "epoch": 0.2036, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.279279608000619e-06, "loss": 0.0, "num_tokens": 112271296.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0645288061350584, "epoch": 0.20368, "grad_norm": 0.0, "learning_rate": 3.2790544797720047e-06, "loss": 0.0, "step": 2546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 247.4140625, "completions/mean_terminated_length": 238.27418518066406, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.06727615371346474, "epoch": 0.20376, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.278829244525727e-06, "loss": 0.0, "num_tokens": 112368501.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0688721276819706, "epoch": 0.20384, "grad_norm": 0.0, "learning_rate": 3.278603902277549e-06, "loss": 0.0, "step": 2548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.90625, "completions/mean_terminated_length": 225.61111450195312, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.06447416171431541, "epoch": 0.20392, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.278378453043244e-06, "loss": 0.0, "num_tokens": 112464617.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06561991944909096, "epoch": 0.204, "grad_norm": 0.0, "learning_rate": 3.2781528968385898e-06, "loss": 0.0, "step": 2550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.34375, "completions/mean_terminated_length": 192.5500030517578, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.061525559052824974, "epoch": 0.20408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.277927233679373e-06, "loss": 0.0, "num_tokens": 112557845.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0613729152828455, "epoch": 0.20416, "grad_norm": 0.0, "learning_rate": 3.2777014635813876e-06, "loss": 0.0, "step": 2552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.6875, "completions/mean_terminated_length": 227.4146270751953, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.0766979493200779, "epoch": 0.20424, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2774755865604352e-06, "loss": 0.0, "num_tokens": 112653805.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07690592482686043, "epoch": 0.20432, "grad_norm": 0.0, "learning_rate": 3.2772496026323252e-06, "loss": 0.0, "step": 2554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.4609375, "completions/mean_terminated_length": 219.89654541015625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.06512941420078278, "epoch": 0.2044, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.277023511812874e-06, "loss": 0.0, "num_tokens": 112748968.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06622936576604843, "epoch": 0.20448, "grad_norm": 0.0, "learning_rate": 3.276797314117905e-06, "loss": 0.0, "step": 2556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 193.3203125, "completions/mean_terminated_length": 177.3431396484375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.06613750010728836, "epoch": 0.20456, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.27657100956325e-06, "loss": 0.0, "num_tokens": 112839249.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 2557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0640804823487997, "epoch": 0.20464, "grad_norm": 0.0, "learning_rate": 3.2763445981647484e-06, "loss": 0.0, "step": 2558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 223.3984375, "completions/mean_terminated_length": 216.2571563720703, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.07358650863170624, "epoch": 0.20472, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.276118079938246e-06, "loss": 0.0, "num_tokens": 112933380.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06891267374157906, "epoch": 0.2048, "grad_norm": 0.0, "learning_rate": 3.275891454899597e-06, "loss": 0.0, "step": 2560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 220.2109375, "completions/mean_terminated_length": 199.44444274902344, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.06842944771051407, "epoch": 0.20488, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2756647230646624e-06, "loss": 0.0, "num_tokens": 113027103.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0724778100848198, "epoch": 0.20496, "grad_norm": 0.0, "learning_rate": 3.2754378844493114e-06, "loss": 0.0, "step": 2562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 214.578125, "completions/mean_terminated_length": 204.52427673339844, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.06496055237948895, "epoch": 0.20504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2752109390694203e-06, "loss": 0.0, "num_tokens": 113120105.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06359249725937843, "epoch": 0.20512, "grad_norm": 0.0, "learning_rate": 3.274983886940872e-06, "loss": 0.0, "step": 2564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 231.25, "completions/mean_terminated_length": 217.3658447265625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.06790326535701752, "epoch": 0.2052, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2747567280795594e-06, "loss": 0.0, "num_tokens": 113215241.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06655895337462425, "epoch": 0.20528, "grad_norm": 0.0, "learning_rate": 3.27452946250138e-06, "loss": 0.0, "step": 2566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.875, "completions/mean_terminated_length": 224.19752502441406, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.0656946562230587, "epoch": 0.20536, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.27430209022224e-06, "loss": 0.0, "num_tokens": 113310969.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06520058214664459, "epoch": 0.20544, "grad_norm": 0.0, "learning_rate": 3.2740746112580537e-06, "loss": 0.0, "step": 2568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 246.8984375, "completions/mean_terminated_length": 240.0410919189453, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.06768885254859924, "epoch": 0.20552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2738470256247415e-06, "loss": 0.0, "num_tokens": 113408108.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06487501785159111, "epoch": 0.2056, "grad_norm": 0.0, "learning_rate": 3.2736193333382326e-06, "loss": 0.0, "step": 2570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.8359375, "completions/mean_terminated_length": 228.74444580078125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.07277275621891022, "epoch": 0.20568, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.273391534414463e-06, "loss": 0.0, "num_tokens": 113503959.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07300513982772827, "epoch": 0.20576, "grad_norm": 0.0, "learning_rate": 3.2731636288693756e-06, "loss": 0.0, "step": 2572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 223.1015625, "completions/mean_terminated_length": 192.1969757080078, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.06404099985957146, "epoch": 0.20584, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2729356167189222e-06, "loss": 0.0, "num_tokens": 113598052.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07002417743206024, "epoch": 0.20592, "grad_norm": 0.0, "learning_rate": 3.272707497979061e-06, "loss": 0.0, "step": 2574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 243.0390625, "completions/mean_terminated_length": 236.9310302734375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.06306781806051731, "epoch": 0.206, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2724792726657575e-06, "loss": 0.0, "num_tokens": 113694697.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06353560835123062, "epoch": 0.20608, "grad_norm": 0.0, "learning_rate": 3.2722509407949856e-06, "loss": 0.0, "step": 2576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 199.4453125, "completions/mean_terminated_length": 175.56666564941406, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.07350925356149673, "epoch": 0.20616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2720225023827253e-06, "loss": 0.0, "num_tokens": 113785762.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07318969815969467, "epoch": 0.20624, "grad_norm": 0.0, "learning_rate": 3.271793957444966e-06, "loss": 0.0, "step": 2578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 219.359375, "completions/mean_terminated_length": 209.09999084472656, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.06354496628046036, "epoch": 0.20632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2715653059977027e-06, "loss": 0.0, "num_tokens": 113879376.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06245886906981468, "epoch": 0.2064, "grad_norm": 0.0, "learning_rate": 3.2713365480569383e-06, "loss": 0.0, "step": 2580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 244.4140625, "completions/mean_terminated_length": 235.95947265625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.07299668714404106, "epoch": 0.20648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2711076836386844e-06, "loss": 0.0, "num_tokens": 113976197.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07256153970956802, "epoch": 0.20656, "grad_norm": 0.0, "learning_rate": 3.2708787127589586e-06, "loss": 0.0, "step": 2582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 220.53125, "completions/mean_terminated_length": 216.17544555664062, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.06927139312028885, "epoch": 0.20664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.270649635433786e-06, "loss": 0.0, "num_tokens": 114069961.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07008027285337448, "epoch": 0.20672, "grad_norm": 0.0, "learning_rate": 3.2704204516792004e-06, "loss": 0.0, "step": 2584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.828125, "completions/mean_terminated_length": 218.35955810546875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.07222796604037285, "epoch": 0.2068, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2701911615112416e-06, "loss": 0.0, "num_tokens": 114164915.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07268951460719109, "epoch": 0.20688, "grad_norm": 0.0, "learning_rate": 3.2699617649459577e-06, "loss": 0.0, "step": 2586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.984375, "completions/mean_terminated_length": 234.93150329589844, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.07137826457619667, "epoch": 0.20696, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2697322619994047e-06, "loss": 0.0, "num_tokens": 114261681.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0705074816942215, "epoch": 0.20704, "grad_norm": 0.0, "learning_rate": 3.2695026526876443e-06, "loss": 0.0, "step": 2588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 216.21875, "completions/mean_terminated_length": 197.4712677001953, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.060930997133255005, "epoch": 0.20712, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.269272937026747e-06, "loss": 0.0, "num_tokens": 114354893.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06070849671959877, "epoch": 0.2072, "grad_norm": 0.0, "learning_rate": 3.269043115032791e-06, "loss": 0.0, "step": 2590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 227.9140625, "completions/mean_terminated_length": 209.3116912841797, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.06468197703361511, "epoch": 0.20728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2688131867218607e-06, "loss": 0.0, "num_tokens": 114449602.0, "reward": 0.37749966979026794, "reward_std": 0.0, "rewards/reward_fn/mean": 0.37749966979026794, "rewards/reward_fn/std": 0.9951284527778625, "step": 2591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06140913628041744, "epoch": 0.20736, "grad_norm": 0.0, "learning_rate": 3.2685831521100497e-06, "loss": 0.0, "step": 2592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.9375, "completions/mean_terminated_length": 235.5454559326172, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.07186578959226608, "epoch": 0.20744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2683530112134564e-06, "loss": 0.0, "num_tokens": 114546106.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07296684384346008, "epoch": 0.20752, "grad_norm": 0.0, "learning_rate": 3.26812276404819e-06, "loss": 0.0, "step": 2594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.6015625, "completions/mean_terminated_length": 216.7260284423828, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.064118891954422, "epoch": 0.2076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.267892410630364e-06, "loss": 0.0, "num_tokens": 114641543.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06601181998848915, "epoch": 0.20768, "grad_norm": 0.0, "learning_rate": 3.2676619509761016e-06, "loss": 0.0, "step": 2596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.3046875, "completions/mean_terminated_length": 240.10000610351562, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.07163849845528603, "epoch": 0.20776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.267431385101532e-06, "loss": 0.0, "num_tokens": 114738734.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07200422883033752, "epoch": 0.20784, "grad_norm": 0.0, "learning_rate": 3.267200713022793e-06, "loss": 0.0, "step": 2598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.546875, "completions/mean_terminated_length": 210.63768005371094, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.0722067803144455, "epoch": 0.20792, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2669699347560275e-06, "loss": 0.0, "num_tokens": 114833908.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07211308553814888, "epoch": 0.208, "grad_norm": 0.0, "learning_rate": 3.26673905031739e-06, "loss": 0.0, "step": 2600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 243.7421875, "completions/mean_terminated_length": 237.09637451171875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.07059244439005852, "epoch": 0.20808, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.266508059723038e-06, "loss": 0.0, "num_tokens": 114930643.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07173363491892815, "epoch": 0.20816, "grad_norm": 0.0, "learning_rate": 3.2662769629891397e-06, "loss": 0.0, "step": 2602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.2890625, "completions/mean_terminated_length": 229.38890075683594, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.07043102383613586, "epoch": 0.20824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.266045760131869e-06, "loss": 0.0, "num_tokens": 115026552.0, "reward": 0.4845491349697113, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4845491349697113, "rewards/reward_fn/std": 0.9969884157180786, "step": 2603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07141732424497604, "epoch": 0.20832, "grad_norm": 0.0, "learning_rate": 3.2658144511674073e-06, "loss": 0.0, "step": 2604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 201.953125, "completions/mean_terminated_length": 190.73585510253906, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.06547491997480392, "epoch": 0.2084, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.265583036111944e-06, "loss": 0.0, "num_tokens": 115117938.0, "reward": 0.0124883484095335, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0124883484095335, "rewards/reward_fn/std": 0.022285200655460358, "step": 2605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0696883536875248, "epoch": 0.20848, "grad_norm": 0.0, "learning_rate": 3.265351514981676e-06, "loss": 0.0, "step": 2606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 224.0, "completions/mean_terminated_length": 216.61538696289062, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.06974289193749428, "epoch": 0.20856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.265119887792807e-06, "loss": 0.0, "num_tokens": 115212146.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06833437457680702, "epoch": 0.20864, "grad_norm": 0.0, "learning_rate": 3.264888154561548e-06, "loss": 0.0, "step": 2608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.453125, "completions/mean_terminated_length": 231.6666717529297, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.0793875940144062, "epoch": 0.20872, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.264656315304119e-06, "loss": 0.0, "num_tokens": 115308844.0, "reward": 0.12230706959962845, "reward_std": 0.0, "rewards/reward_fn/mean": 0.12230706959962845, "rewards/reward_fn/std": 0.32486557960510254, "step": 2609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0795474424958229, "epoch": 0.2088, "grad_norm": 0.0, "learning_rate": 3.2644243700367457e-06, "loss": 0.0, "step": 2610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.0859375, "completions/mean_terminated_length": 222.89610290527344, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.06933768466114998, "epoch": 0.20888, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2641923187756614e-06, "loss": 0.0, "num_tokens": 115404599.0, "reward": 0.384978711605072, "reward_std": 0.0, "rewards/reward_fn/mean": 0.384978711605072, "rewards/reward_fn/std": 0.9926154613494873, "step": 2611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07304384559392929, "epoch": 0.20896, "grad_norm": 0.0, "learning_rate": 3.263960161537108e-06, "loss": 0.0, "step": 2612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.4375, "completions/mean_terminated_length": 212.53334045410156, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.06582217290997505, "epoch": 0.20904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.263727898337334e-06, "loss": 0.0, "num_tokens": 115498991.0, "reward": 0.3799973428249359, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3799973428249359, "rewards/reward_fn/std": 0.9942457675933838, "step": 2613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06573182344436646, "epoch": 0.20912, "grad_norm": 0.0, "learning_rate": 3.263495529192595e-06, "loss": 0.0, "step": 2614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 239.5234375, "completions/mean_terminated_length": 221.98387145996094, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.06142580136656761, "epoch": 0.2092, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.263263054119154e-06, "loss": 0.0, "num_tokens": 115595186.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 2615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06108321622014046, "epoch": 0.20928, "grad_norm": 0.0, "learning_rate": 3.263030473133282e-06, "loss": 0.0, "step": 2616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.59375, "completions/mean_terminated_length": 224.12245178222656, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.073209747672081, "epoch": 0.20936, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.262797786251259e-06, "loss": 0.0, "num_tokens": 115690366.0, "reward": 0.02467191591858864, "reward_std": 0.0, "rewards/reward_fn/mean": 0.02467191591858864, "rewards/reward_fn/std": 0.06553223729133606, "step": 2617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07096951454877853, "epoch": 0.20944, "grad_norm": 0.0, "learning_rate": 3.2625649934893676e-06, "loss": 0.0, "step": 2618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 225.828125, "completions/mean_terminated_length": 220.5688018798828, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.06730474159121513, "epoch": 0.20952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.262332094863903e-06, "loss": 0.0, "num_tokens": 115784808.0, "reward": 0.0074910130351781845, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0074910130351781845, "rewards/reward_fn/std": 0.019897233694791794, "step": 2619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06805114448070526, "epoch": 0.2096, "grad_norm": 0.0, "learning_rate": 3.2620990903911653e-06, "loss": 0.0, "step": 2620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.2265625, "completions/mean_terminated_length": 213.6913604736328, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.07543598487973213, "epoch": 0.20968, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2618659800874616e-06, "loss": 0.0, "num_tokens": 115879685.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07855229079723358, "epoch": 0.20976, "grad_norm": 0.0, "learning_rate": 3.261632763969108e-06, "loss": 0.0, "step": 2622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 211.34375, "completions/mean_terminated_length": 190.29884338378906, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.06268522329628468, "epoch": 0.20984, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2613994420524264e-06, "loss": 0.0, "num_tokens": 115972273.0, "reward": 0.4633024334907532, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4633024334907532, "rewards/reward_fn/std": 0.9901458024978638, "step": 2623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06284216791391373, "epoch": 0.20992, "grad_norm": 0.0, "learning_rate": 3.2611660143537476e-06, "loss": 0.0, "step": 2624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 245.3046875, "completions/mean_terminated_length": 236.98611450195312, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.06935608386993408, "epoch": 0.21, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2609324808894084e-06, "loss": 0.0, "num_tokens": 116069208.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06908159703016281, "epoch": 0.21008, "grad_norm": 0.0, "learning_rate": 3.2606988416757543e-06, "loss": 0.0, "step": 2626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.1015625, "completions/mean_terminated_length": 241.13235473632812, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.06925047934055328, "epoch": 0.21016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.260465096729137e-06, "loss": 0.0, "num_tokens": 116166501.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06821852922439575, "epoch": 0.21024, "grad_norm": 0.0, "learning_rate": 3.2602312460659167e-06, "loss": 0.0, "step": 2628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 177.859375, "completions/mean_terminated_length": 159.82693481445312, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.07837484404444695, "epoch": 0.21032, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.25999728970246e-06, "loss": 0.0, "num_tokens": 116254803.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 2629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07780642062425613, "epoch": 0.2104, "grad_norm": 0.0, "learning_rate": 3.2597632276551417e-06, "loss": 0.0, "step": 2630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 232.703125, "completions/mean_terminated_length": 221.32557678222656, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.06566295027732849, "epoch": 0.21048, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2595290599403433e-06, "loss": 0.0, "num_tokens": 116350125.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06429225951433182, "epoch": 0.21056, "grad_norm": 0.0, "learning_rate": 3.259294786574455e-06, "loss": 0.0, "step": 2632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.046875, "completions/mean_terminated_length": 214.87356567382812, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.06849836185574532, "epoch": 0.21064, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2590604075738723e-06, "loss": 0.0, "num_tokens": 116444851.0, "reward": 0.759978711605072, "reward_std": 0.0, "rewards/reward_fn/mean": 0.759978711605072, "rewards/reward_fn/std": 1.2986160516738892, "step": 2633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06801358237862587, "epoch": 0.21072, "grad_norm": 0.0, "learning_rate": 3.258825922955e-06, "loss": 0.0, "step": 2634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 249.796875, "completions/mean_terminated_length": 244.81689453125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.07044532522559166, "epoch": 0.2108, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.258591332734249e-06, "loss": 0.0, "num_tokens": 116542361.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07186214998364449, "epoch": 0.21088, "grad_norm": 0.0, "learning_rate": 3.2583566369280388e-06, "loss": 0.0, "step": 2636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.859375, "completions/mean_terminated_length": 240.40000915527344, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.07993130013346672, "epoch": 0.21096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.258121835552795e-06, "loss": 0.0, "num_tokens": 116639495.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07707921415567398, "epoch": 0.21104, "grad_norm": 0.0, "learning_rate": 3.257886928624952e-06, "loss": 0.0, "step": 2638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.15625, "completions/mean_terminated_length": 238.6896514892578, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.07265311107039452, "epoch": 0.21112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2576519161609496e-06, "loss": 0.0, "num_tokens": 116736795.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07436834275722504, "epoch": 0.2112, "grad_norm": 0.0, "learning_rate": 3.257416798177237e-06, "loss": 0.0, "step": 2640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 212.8671875, "completions/mean_terminated_length": 193.26136779785156, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.06441478058695793, "epoch": 0.21128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2571815746902706e-06, "loss": 0.0, "num_tokens": 116829578.0, "reward": 0.46573716402053833, "reward_std": 0.0, "rewards/reward_fn/mean": 0.46573716402053833, "rewards/reward_fn/std": 0.9907692670822144, "step": 2641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.061792368069291115, "epoch": 0.21136, "grad_norm": 0.0, "learning_rate": 3.256946245716512e-06, "loss": 0.0, "step": 2642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6484375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 247.4453125, "completions/mean_terminated_length": 231.6666717529297, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.07746841758489609, "epoch": 0.21144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2567108112724325e-06, "loss": 0.0, "num_tokens": 116926787.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07590461522340775, "epoch": 0.21152, "grad_norm": 0.0, "learning_rate": 3.256475271374511e-06, "loss": 0.0, "step": 2644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.875, "completions/mean_terminated_length": 214.6206817626953, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.06808391213417053, "epoch": 0.2116, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2562396260392314e-06, "loss": 0.0, "num_tokens": 117021491.0, "reward": 0.3799973428249359, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3799973428249359, "rewards/reward_fn/std": 0.9942457675933838, "step": 2645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06485695391893387, "epoch": 0.21168, "grad_norm": 0.0, "learning_rate": 3.2560038752830865e-06, "loss": 0.0, "step": 2646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 209.765625, "completions/mean_terminated_length": 199.6381072998047, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.06930850446224213, "epoch": 0.21176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.255768019122577e-06, "loss": 0.0, "num_tokens": 117113877.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06825892999768257, "epoch": 0.21184, "grad_norm": 0.0, "learning_rate": 3.25553205757421e-06, "loss": 0.0, "step": 2648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.078125, "completions/mean_terminated_length": 240.88095092773438, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.07051936537027359, "epoch": 0.21192, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2552959906545003e-06, "loss": 0.0, "num_tokens": 117210911.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06700501590967178, "epoch": 0.212, "grad_norm": 0.0, "learning_rate": 3.25505981837997e-06, "loss": 0.0, "step": 2650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.0390625, "completions/mean_terminated_length": 183.95774841308594, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.0627469252794981, "epoch": 0.21208, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2548235407671497e-06, "loss": 0.0, "num_tokens": 117304100.0, "reward": 0.799616277217865, "reward_std": 0.0, "rewards/reward_fn/mean": 0.799616277217865, "rewards/reward_fn/std": 1.281852126121521, "step": 2651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.061099693179130554, "epoch": 0.21216, "grad_norm": 0.0, "learning_rate": 3.2545871578325746e-06, "loss": 0.0, "step": 2652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.2890625, "completions/mean_terminated_length": 229.489990234375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.0655999481678009, "epoch": 0.21224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2543506695927907e-06, "loss": 0.0, "num_tokens": 117399753.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06594199314713478, "epoch": 0.21232, "grad_norm": 0.0, "learning_rate": 3.2541140760643484e-06, "loss": 0.0, "step": 2654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.5078125, "completions/mean_terminated_length": 238.277099609375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.067926324903965, "epoch": 0.2124, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2538773772638074e-06, "loss": 0.0, "num_tokens": 117496586.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0671619176864624, "epoch": 0.21248, "grad_norm": 0.0, "learning_rate": 3.2536405732077337e-06, "loss": 0.0, "step": 2656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 205.0234375, "completions/mean_terminated_length": 193.85714721679688, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.06337057054042816, "epoch": 0.21256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2534036639127016e-06, "loss": 0.0, "num_tokens": 117588365.0, "reward": 0.3972600996494293, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3972600996494293, "rewards/reward_fn/std": 0.9893408417701721, "step": 2657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06077073700726032, "epoch": 0.21264, "grad_norm": 0.0, "learning_rate": 3.2531666493952917e-06, "loss": 0.0, "step": 2658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.03125, "completions/mean_terminated_length": 238.02816772460938, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.07631287723779678, "epoch": 0.21272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2529295296720928e-06, "loss": 0.0, "num_tokens": 117685393.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07422871887683868, "epoch": 0.2128, "grad_norm": 0.0, "learning_rate": 3.2526923047597007e-06, "loss": 0.0, "step": 2660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5703125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 250.3203125, "completions/mean_terminated_length": 242.7818145751953, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.0733381137251854, "epoch": 0.21288, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.252454974674719e-06, "loss": 0.0, "num_tokens": 117782970.0, "reward": 0.09723600745201111, "reward_std": 0.0, "rewards/reward_fn/mean": 0.09723600745201111, "rewards/reward_fn/std": 0.2582731544971466, "step": 2661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07373291254043579, "epoch": 0.21296, "grad_norm": 0.0, "learning_rate": 3.252217539433758e-06, "loss": 0.0, "step": 2662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 215.3125, "completions/mean_terminated_length": 203.9199981689453, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.05980619415640831, "epoch": 0.21304, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2519799990534354e-06, "loss": 0.0, "num_tokens": 117876066.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05953189916908741, "epoch": 0.21312, "grad_norm": 0.0, "learning_rate": 3.2517423535503765e-06, "loss": 0.0, "step": 2664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.3359375, "completions/mean_terminated_length": 223.79412841796875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.07126254960894585, "epoch": 0.2132, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.251504602941215e-06, "loss": 0.0, "num_tokens": 117971085.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07259175553917885, "epoch": 0.21328, "grad_norm": 0.0, "learning_rate": 3.25126674724259e-06, "loss": 0.0, "step": 2666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1640625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 210.59375, "completions/mean_terminated_length": 201.68223571777344, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.07635306939482689, "epoch": 0.21336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2510287864711484e-06, "loss": 0.0, "num_tokens": 118063577.0, "reward": 0.11482523381710052, "reward_std": 0.0, "rewards/reward_fn/mean": 0.11482523381710052, "rewards/reward_fn/std": 0.3049927055835724, "step": 2667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07882532477378845, "epoch": 0.21344, "grad_norm": 0.0, "learning_rate": 3.2507907206435465e-06, "loss": 0.0, "step": 2668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.546875, "completions/mean_terminated_length": 213.6511688232422, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.06041034869849682, "epoch": 0.21352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.250552549776445e-06, "loss": 0.0, "num_tokens": 118158239.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06533141434192657, "epoch": 0.2136, "grad_norm": 0.0, "learning_rate": 3.250314273886514e-06, "loss": 0.0, "step": 2670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 243.65625, "completions/mean_terminated_length": 229.6666717529297, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.06970182061195374, "epoch": 0.21368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.25007589299043e-06, "loss": 0.0, "num_tokens": 118254963.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07048561424016953, "epoch": 0.21376, "grad_norm": 0.0, "learning_rate": 3.249837407104877e-06, "loss": 0.0, "step": 2672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 213.953125, "completions/mean_terminated_length": 191.1566162109375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.07015795260667801, "epoch": 0.21384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2495988162465473e-06, "loss": 0.0, "num_tokens": 118347885.0, "reward": 0.47340086102485657, "reward_std": 0.0, "rewards/reward_fn/mean": 0.47340086102485657, "rewards/reward_fn/std": 0.9740652441978455, "step": 2673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07199037075042725, "epoch": 0.21392, "grad_norm": 0.0, "learning_rate": 3.249360120432139e-06, "loss": 0.0, "step": 2674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.921875, "completions/mean_terminated_length": 229.28712463378906, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.06936635822057724, "epoch": 0.214, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2491213196783585e-06, "loss": 0.0, "num_tokens": 118443491.0, "reward": 0.004997334908694029, "reward_std": 0.0, "rewards/reward_fn/mean": 0.004997334908694029, "rewards/reward_fn/std": 0.013273656368255615, "step": 2675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07007061317563057, "epoch": 0.21408, "grad_norm": 0.0, "learning_rate": 3.24888241400192e-06, "loss": 0.0, "step": 2676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 243.4453125, "completions/mean_terminated_length": 232.36764526367188, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.07162592932581902, "epoch": 0.21416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2486434034195427e-06, "loss": 0.0, "num_tokens": 118540188.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06891312077641487, "epoch": 0.21424, "grad_norm": 0.0, "learning_rate": 3.248404287947956e-06, "loss": 0.0, "step": 2678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.7109375, "completions/mean_terminated_length": 229.2692413330078, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.07120274752378464, "epoch": 0.21432, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2481650676038956e-06, "loss": 0.0, "num_tokens": 118636407.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07198010012507439, "epoch": 0.2144, "grad_norm": 0.0, "learning_rate": 3.2479257424041044e-06, "loss": 0.0, "step": 2680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 244.5546875, "completions/mean_terminated_length": 237.21795654296875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.07251390069723129, "epoch": 0.21448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.247686312365332e-06, "loss": 0.0, "num_tokens": 118733246.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07162313163280487, "epoch": 0.21456, "grad_norm": 0.0, "learning_rate": 3.247446777504336e-06, "loss": 0.0, "step": 2682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.984375, "completions/mean_terminated_length": 224.98948669433594, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.06735068559646606, "epoch": 0.21464, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.247207137837882e-06, "loss": 0.0, "num_tokens": 118828604.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06587698310613632, "epoch": 0.21472, "grad_norm": 0.0, "learning_rate": 3.2469673933827413e-06, "loss": 0.0, "step": 2684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.390625, "completions/mean_terminated_length": 217.79661560058594, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.06157033331692219, "epoch": 0.2148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.246727544155694e-06, "loss": 0.0, "num_tokens": 118924654.0, "reward": 0.3799973428249359, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3799973428249359, "rewards/reward_fn/std": 0.9942457675933838, "step": 2685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0609164759516716, "epoch": 0.21488, "grad_norm": 0.0, "learning_rate": 3.246487590173528e-06, "loss": 0.0, "step": 2686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 236.1875, "completions/mean_terminated_length": 226.85057067871094, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.07602755725383759, "epoch": 0.21496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.246247531453036e-06, "loss": 0.0, "num_tokens": 119020422.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07599475607275963, "epoch": 0.21504, "grad_norm": 0.0, "learning_rate": 3.2460073680110203e-06, "loss": 0.0, "step": 2688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.5703125, "completions/mean_terminated_length": 210.09375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.06682197377085686, "epoch": 0.21512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2457670998642896e-06, "loss": 0.0, "num_tokens": 119114319.0, "reward": 0.39967191219329834, "reward_std": 0.0, "rewards/reward_fn/mean": 0.39967191219329834, "rewards/reward_fn/std": 0.988822877407074, "step": 2689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06555794179439545, "epoch": 0.2152, "grad_norm": 0.0, "learning_rate": 3.24552672702966e-06, "loss": 0.0, "step": 2690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.90625, "completions/mean_terminated_length": 210.8648681640625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.07593837380409241, "epoch": 0.21528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2452862495239556e-06, "loss": 0.0, "num_tokens": 119209283.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0696023628115654, "epoch": 0.21536, "grad_norm": 0.0, "learning_rate": 3.245045667364006e-06, "loss": 0.0, "step": 2692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6171875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 250.859375, "completions/mean_terminated_length": 242.57142639160156, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.07060681283473969, "epoch": 0.21544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2448049805666516e-06, "loss": 0.0, "num_tokens": 119306929.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07357174530625343, "epoch": 0.21552, "grad_norm": 0.0, "learning_rate": 3.244564189148736e-06, "loss": 0.0, "step": 2694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 200.75, "completions/mean_terminated_length": 169.75608825683594, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.07009577751159668, "epoch": 0.2156, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.244323293127113e-06, "loss": 0.0, "num_tokens": 119398161.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06630738824605942, "epoch": 0.21568, "grad_norm": 0.0, "learning_rate": 3.2440822925186422e-06, "loss": 0.0, "step": 2696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 211.71875, "completions/mean_terminated_length": 203.99998474121094, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.07868340238928795, "epoch": 0.21576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2438411873401915e-06, "loss": 0.0, "num_tokens": 119490797.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07374867051839828, "epoch": 0.21584, "grad_norm": 0.0, "learning_rate": 3.243599977608636e-06, "loss": 0.0, "step": 2698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 240.125, "completions/mean_terminated_length": 230.2784881591797, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.07763771712779999, "epoch": 0.21592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2433586633408573e-06, "loss": 0.0, "num_tokens": 119587069.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07617365941405296, "epoch": 0.216, "grad_norm": 0.0, "learning_rate": 3.243117244553745e-06, "loss": 0.0, "step": 2700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 209.421875, "completions/mean_terminated_length": 191.19564819335938, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.0705752857029438, "epoch": 0.21608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2428757212641953e-06, "loss": 0.0, "num_tokens": 119679411.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 2701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07006118446588516, "epoch": 0.21616, "grad_norm": 0.0, "learning_rate": 3.2426340934891132e-06, "loss": 0.0, "step": 2702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.328125, "completions/mean_terminated_length": 220.43589782714844, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.06753483787178993, "epoch": 0.21624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2423923612454102e-06, "loss": 0.0, "num_tokens": 119774941.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0698690377175808, "epoch": 0.21632, "grad_norm": 0.0, "learning_rate": 3.2421505245500045e-06, "loss": 0.0, "step": 2704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.9921875, "completions/mean_terminated_length": 220.43055725097656, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.06868430972099304, "epoch": 0.2164, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2419085834198223e-06, "loss": 0.0, "num_tokens": 119870684.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07177572324872017, "epoch": 0.21648, "grad_norm": 0.0, "learning_rate": 3.2416665378717962e-06, "loss": 0.0, "step": 2706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.6484375, "completions/mean_terminated_length": 216.3913116455078, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.06723615154623985, "epoch": 0.21656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2414243879228674e-06, "loss": 0.0, "num_tokens": 119966255.0, "reward": 0.15768590569496155, "reward_std": 0.0, "rewards/reward_fn/mean": 0.15768590569496155, "rewards/reward_fn/std": 0.32778966426849365, "step": 2707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07180137559771538, "epoch": 0.21664, "grad_norm": 0.0, "learning_rate": 3.241182133589984e-06, "loss": 0.0, "step": 2708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 242.6015625, "completions/mean_terminated_length": 237.35870361328125, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.06260482035577297, "epoch": 0.21672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.240939774890102e-06, "loss": 0.0, "num_tokens": 120062844.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06381545960903168, "epoch": 0.2168, "grad_norm": 0.0, "learning_rate": 3.240697311840182e-06, "loss": 0.0, "step": 2710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.40625, "completions/mean_terminated_length": 233.9746856689453, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.0716540738940239, "epoch": 0.21688, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2404547444571952e-06, "loss": 0.0, "num_tokens": 120159408.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06928262859582901, "epoch": 0.21696, "grad_norm": 0.0, "learning_rate": 3.2402120727581183e-06, "loss": 0.0, "step": 2712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.4296875, "completions/mean_terminated_length": 237.71591186523438, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.07040796801447868, "epoch": 0.21704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2399692967599363e-06, "loss": 0.0, "num_tokens": 120256103.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06986509636044502, "epoch": 0.21712, "grad_norm": 0.0, "learning_rate": 3.23972641647964e-06, "loss": 0.0, "step": 2714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.1640625, "completions/mean_terminated_length": 236.04629516601562, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.0640086717903614, "epoch": 0.2172, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2394834319342297e-06, "loss": 0.0, "num_tokens": 120352252.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06467096135020256, "epoch": 0.21728, "grad_norm": 0.0, "learning_rate": 3.239240343140711e-06, "loss": 0.0, "step": 2716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 224.359375, "completions/mean_terminated_length": 203.40260314941406, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.07314939051866531, "epoch": 0.21736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.238997150116097e-06, "loss": 0.0, "num_tokens": 120446506.0, "reward": 0.6921311616897583, "reward_std": 0.0, "rewards/reward_fn/mean": 0.6921311616897583, "rewards/reward_fn/std": 1.0914734601974487, "step": 2717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06848156452178955, "epoch": 0.21744, "grad_norm": 0.0, "learning_rate": 3.2387538528774098e-06, "loss": 0.0, "step": 2718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 226.75, "completions/mean_terminated_length": 208.0, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.06660384684801102, "epoch": 0.21752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2385104514416768e-06, "loss": 0.0, "num_tokens": 120541066.0, "reward": 0.07643959671258926, "reward_std": 0.0, "rewards/reward_fn/mean": 0.07643959671258926, "rewards/reward_fn/std": 0.19555726647377014, "step": 2719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06651538237929344, "epoch": 0.2176, "grad_norm": 0.0, "learning_rate": 3.2382669458259344e-06, "loss": 0.0, "step": 2720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 239.921875, "completions/mean_terminated_length": 234.1063690185547, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.07080836221575737, "epoch": 0.21768, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2380233360472243e-06, "loss": 0.0, "num_tokens": 120637312.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06961572170257568, "epoch": 0.21776, "grad_norm": 0.0, "learning_rate": 3.237779622122597e-06, "loss": 0.0, "step": 2722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 250.1328125, "completions/mean_terminated_length": 240.97999572753906, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "entropy": 0.06523270905017853, "epoch": 0.21784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.237535804069111e-06, "loss": 0.0, "num_tokens": 120734865.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06234206631779671, "epoch": 0.21792, "grad_norm": 0.0, "learning_rate": 3.2372918819038295e-06, "loss": 0.0, "step": 2724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.1953125, "completions/mean_terminated_length": 242.25608825683594, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.06344447284936905, "epoch": 0.218, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.237047855643825e-06, "loss": 0.0, "num_tokens": 120832042.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06308842450380325, "epoch": 0.21808, "grad_norm": 0.0, "learning_rate": 3.236803725306177e-06, "loss": 0.0, "step": 2726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.1875, "completions/mean_terminated_length": 237.3333282470703, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.07113511487841606, "epoch": 0.21816, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2365594909079713e-06, "loss": 0.0, "num_tokens": 120928834.0, "reward": 0.047493621706962585, "reward_std": 0.0, "rewards/reward_fn/mean": 0.047493621706962585, "rewards/reward_fn/std": 0.12615004181861877, "step": 2727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06940006464719772, "epoch": 0.21824, "grad_norm": 0.0, "learning_rate": 3.236315152466303e-06, "loss": 0.0, "step": 2728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.53125, "completions/mean_terminated_length": 226.57894897460938, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.06656219065189362, "epoch": 0.21832, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.236070709998272e-06, "loss": 0.0, "num_tokens": 121024902.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06607618927955627, "epoch": 0.2184, "grad_norm": 0.0, "learning_rate": 3.2358261635209872e-06, "loss": 0.0, "step": 2730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.203125, "completions/mean_terminated_length": 236.26504516601562, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.06792446225881577, "epoch": 0.21848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2355815130515646e-06, "loss": 0.0, "num_tokens": 121121568.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0664018914103508, "epoch": 0.21856, "grad_norm": 0.0, "learning_rate": 3.2353367586071264e-06, "loss": 0.0, "step": 2732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.2578125, "completions/mean_terminated_length": 240.01612854003906, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.06985846906900406, "epoch": 0.21864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2350919002048033e-06, "loss": 0.0, "num_tokens": 121218881.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07065547630190849, "epoch": 0.21872, "grad_norm": 0.0, "learning_rate": 3.234846937861733e-06, "loss": 0.0, "step": 2734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 215.859375, "completions/mean_terminated_length": 204.6199951171875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.07036329805850983, "epoch": 0.2188, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2346018715950597e-06, "loss": 0.0, "num_tokens": 121312047.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06825857982039452, "epoch": 0.21888, "grad_norm": 0.0, "learning_rate": 3.234356701421936e-06, "loss": 0.0, "step": 2736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.4921875, "completions/mean_terminated_length": 215.38710021972656, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.06902635842561722, "epoch": 0.21896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2341114273595207e-06, "loss": 0.0, "num_tokens": 121406574.0, "reward": 0.4421311914920807, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4421311914920807, "rewards/reward_fn/std": 0.98649662733078, "step": 2737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07416459545493126, "epoch": 0.21904, "grad_norm": 0.0, "learning_rate": 3.2338660494249806e-06, "loss": 0.0, "step": 2738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5859375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.953125, "completions/mean_terminated_length": 207.5849151611328, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.07056821882724762, "epoch": 0.21912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2336205676354903e-06, "loss": 0.0, "num_tokens": 121502312.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07108037918806076, "epoch": 0.2192, "grad_norm": 0.0, "learning_rate": 3.2333749820082297e-06, "loss": 0.0, "step": 2740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 235.4296875, "completions/mean_terminated_length": 226.74444580078125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.07024221122264862, "epoch": 0.21928, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2331292925603872e-06, "loss": 0.0, "num_tokens": 121597983.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0676814392209053, "epoch": 0.21936, "grad_norm": 0.0, "learning_rate": 3.23288349930916e-06, "loss": 0.0, "step": 2742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.1171875, "completions/mean_terminated_length": 218.5978240966797, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.06582389026880264, "epoch": 0.21944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2326376022717498e-06, "loss": 0.0, "num_tokens": 121692846.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06503012031316757, "epoch": 0.21952, "grad_norm": 0.0, "learning_rate": 3.2323916014653668e-06, "loss": 0.0, "step": 2744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.7578125, "completions/mean_terminated_length": 241.59341430664062, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.06355991214513779, "epoch": 0.2196, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.232145496907229e-06, "loss": 0.0, "num_tokens": 121789839.0, "reward": 0.07393992692232132, "reward_std": 0.0, "rewards/reward_fn/mean": 0.07393992692232132, "rewards/reward_fn/std": 0.19639533758163452, "step": 2745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06425075605511665, "epoch": 0.21968, "grad_norm": 0.0, "learning_rate": 3.231899288614561e-06, "loss": 0.0, "step": 2746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 237.1640625, "completions/mean_terminated_length": 228.60227966308594, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.06818477809429169, "epoch": 0.21976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2316529766045943e-06, "loss": 0.0, "num_tokens": 121885732.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06923758611083031, "epoch": 0.21984, "grad_norm": 0.0, "learning_rate": 3.231406560894568e-06, "loss": 0.0, "step": 2748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.859375, "completions/mean_terminated_length": 222.04396057128906, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.07599321752786636, "epoch": 0.21992, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.23116004150173e-06, "loss": 0.0, "num_tokens": 121980946.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07445700839161873, "epoch": 0.22, "grad_norm": 0.0, "learning_rate": 3.2309134184433324e-06, "loss": 0.0, "step": 2750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.2734375, "completions/mean_terminated_length": 211.5, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.06954498961567879, "epoch": 0.22008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.230666691736637e-06, "loss": 0.0, "num_tokens": 122075957.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06987742334604263, "epoch": 0.22016, "grad_norm": 0.0, "learning_rate": 3.230419861398912e-06, "loss": 0.0, "step": 2752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.25, "completions/mean_terminated_length": 230.72000122070312, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.06286659836769104, "epoch": 0.22024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.230172927447433e-06, "loss": 0.0, "num_tokens": 122171733.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06509474664926529, "epoch": 0.22032, "grad_norm": 0.0, "learning_rate": 3.229925889899483e-06, "loss": 0.0, "step": 2754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6796875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.5, "completions/mean_terminated_length": 220.09754943847656, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.066129669547081, "epoch": 0.2204, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.229678748772351e-06, "loss": 0.0, "num_tokens": 122268565.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06156606413424015, "epoch": 0.22048, "grad_norm": 0.0, "learning_rate": 3.2294315040833356e-06, "loss": 0.0, "step": 2756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 243.203125, "completions/mean_terminated_length": 238.75790405273438, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.06910840049386024, "epoch": 0.22056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2291841558497402e-06, "loss": 0.0, "num_tokens": 122365231.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07012134790420532, "epoch": 0.22064, "grad_norm": 0.0, "learning_rate": 3.228936704088877e-06, "loss": 0.0, "step": 2758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.1328125, "completions/mean_terminated_length": 233.44644165039062, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.06702737510204315, "epoch": 0.22072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2286891488180654e-06, "loss": 0.0, "num_tokens": 122462272.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06490461900830269, "epoch": 0.2208, "grad_norm": 0.0, "learning_rate": 3.228441490054631e-06, "loss": 0.0, "step": 2760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.3515625, "completions/mean_terminated_length": 212.22666931152344, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.06869960203766823, "epoch": 0.22088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.228193727815908e-06, "loss": 0.0, "num_tokens": 122557293.0, "reward": 0.38992840051651, "reward_std": 0.0, "rewards/reward_fn/mean": 0.38992840051651, "rewards/reward_fn/std": 0.9911679029464722, "step": 2761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06824060529470444, "epoch": 0.22096, "grad_norm": 0.0, "learning_rate": 3.227945862119236e-06, "loss": 0.0, "step": 2762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.828125, "completions/mean_terminated_length": 224.15383911132812, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.06966116279363632, "epoch": 0.22104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.227697892981964e-06, "loss": 0.0, "num_tokens": 122653527.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06853602454066277, "epoch": 0.22112, "grad_norm": 0.0, "learning_rate": 3.2274498204214473e-06, "loss": 0.0, "step": 2764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 210.90625, "completions/mean_terminated_length": 183.85000610351562, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.07645103707909584, "epoch": 0.2212, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2272016444550475e-06, "loss": 0.0, "num_tokens": 122746059.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07387887686491013, "epoch": 0.22128, "grad_norm": 0.0, "learning_rate": 3.2269533651001353e-06, "loss": 0.0, "step": 2766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 237.9375, "completions/mean_terminated_length": 222.971435546875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.07148075476288795, "epoch": 0.22136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2267049823740866e-06, "loss": 0.0, "num_tokens": 122842051.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06919839978218079, "epoch": 0.22144, "grad_norm": 0.0, "learning_rate": 3.2264564962942865e-06, "loss": 0.0, "step": 2768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5078125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.7109375, "completions/mean_terminated_length": 222.90476989746094, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.06791483238339424, "epoch": 0.22152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2262079068781256e-06, "loss": 0.0, "num_tokens": 122938270.0, "reward": 0.4489399194717407, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4489399194717407, "rewards/reward_fn/std": 0.9873224496841431, "step": 2769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06842394918203354, "epoch": 0.2216, "grad_norm": 0.0, "learning_rate": 3.2259592141430035e-06, "loss": 0.0, "step": 2770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 218.171875, "completions/mean_terminated_length": 197.6626434326172, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.062083955854177475, "epoch": 0.22168, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2257104181063253e-06, "loss": 0.0, "num_tokens": 123031732.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 2771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0645076259970665, "epoch": 0.22176, "grad_norm": 0.0, "learning_rate": 3.2254615187855034e-06, "loss": 0.0, "step": 2772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.5234375, "completions/mean_terminated_length": 217.64614868164062, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.06836194545030594, "epoch": 0.22184, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.22521251619796e-06, "loss": 0.0, "num_tokens": 123127543.0, "reward": 0.1315361112356186, "reward_std": 0.0, "rewards/reward_fn/mean": 0.1315361112356186, "rewards/reward_fn/std": 0.3065848648548126, "step": 2773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06817800551652908, "epoch": 0.22192, "grad_norm": 0.0, "learning_rate": 3.2249634103611216e-06, "loss": 0.0, "step": 2774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 223.984375, "completions/mean_terminated_length": 205.40740966796875, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.07212316244840622, "epoch": 0.222, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.224714201292423e-06, "loss": 0.0, "num_tokens": 123221749.0, "reward": 0.8057804703712463, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8057804703712463, "rewards/reward_fn/std": 1.2800037860870361, "step": 2775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07358819991350174, "epoch": 0.22208, "grad_norm": 0.0, "learning_rate": 3.2244648890093057e-06, "loss": 0.0, "step": 2776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 223.5, "completions/mean_terminated_length": 212.21054077148438, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.06519945710897446, "epoch": 0.22216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.22421547352922e-06, "loss": 0.0, "num_tokens": 123315893.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06361385062336922, "epoch": 0.22224, "grad_norm": 0.0, "learning_rate": 3.2239659548696215e-06, "loss": 0.0, "step": 2778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 224.625, "completions/mean_terminated_length": 213.27659606933594, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.0719456821680069, "epoch": 0.22232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2237163330479747e-06, "loss": 0.0, "num_tokens": 123410181.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06692329794168472, "epoch": 0.2224, "grad_norm": 0.0, "learning_rate": 3.223466608081749e-06, "loss": 0.0, "step": 2780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 184.328125, "completions/mean_terminated_length": 178.90757751464844, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.08483640104532242, "epoch": 0.22248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2232167799884242e-06, "loss": 0.0, "num_tokens": 123499311.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 2781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08571537211537361, "epoch": 0.22256, "grad_norm": 0.0, "learning_rate": 3.2229668487854845e-06, "loss": 0.0, "step": 2782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 249.8515625, "completions/mean_terminated_length": 242.4310302734375, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.0771227702498436, "epoch": 0.22264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.222716814490423e-06, "loss": 0.0, "num_tokens": 123596828.0, "reward": 0.04093467444181442, "reward_std": 0.0, "rewards/reward_fn/mean": 0.04093467444181442, "rewards/reward_fn/std": 0.10872851312160492, "step": 2783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08003692328929901, "epoch": 0.22272, "grad_norm": 0.0, "learning_rate": 3.22246667712074e-06, "loss": 0.0, "step": 2784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.0390625, "completions/mean_terminated_length": 214.3837127685547, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.0748610720038414, "epoch": 0.2228, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.222216436693941e-06, "loss": 0.0, "num_tokens": 123691553.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07060778886079788, "epoch": 0.22288, "grad_norm": 0.0, "learning_rate": 3.221966093227541e-06, "loss": 0.0, "step": 2786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 206.2109375, "completions/mean_terminated_length": 181.89535522460938, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.06912350282073021, "epoch": 0.22296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2217156467390615e-06, "loss": 0.0, "num_tokens": 123783484.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07033228501677513, "epoch": 0.22304, "grad_norm": 0.0, "learning_rate": 3.2214650972460308e-06, "loss": 0.0, "step": 2788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 219.0625, "completions/mean_terminated_length": 201.65516662597656, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.07651474326848984, "epoch": 0.22312, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2212144447659855e-06, "loss": 0.0, "num_tokens": 123877060.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07512819766998291, "epoch": 0.2232, "grad_norm": 0.0, "learning_rate": 3.2209636893164677e-06, "loss": 0.0, "step": 2790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.6484375, "completions/mean_terminated_length": 232.40298461914062, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.06685378402471542, "epoch": 0.22328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2207128309150278e-06, "loss": 0.0, "num_tokens": 123973783.0, "reward": 0.11723288148641586, "reward_std": 0.0, "rewards/reward_fn/mean": 0.11723288148641586, "rewards/reward_fn/std": 0.3113877773284912, "step": 2791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06526162102818489, "epoch": 0.22336, "grad_norm": 0.0, "learning_rate": 3.220461869579224e-06, "loss": 0.0, "step": 2792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5546875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 236.484375, "completions/mean_terminated_length": 212.17544555664062, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.07327145338058472, "epoch": 0.22344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2202108053266195e-06, "loss": 0.0, "num_tokens": 124069589.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06984524801373482, "epoch": 0.22352, "grad_norm": 0.0, "learning_rate": 3.2199596381747874e-06, "loss": 0.0, "step": 2794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 190.546875, "completions/mean_terminated_length": 175.44232177734375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.0633986871689558, "epoch": 0.2236, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2197083681413064e-06, "loss": 0.0, "num_tokens": 124159515.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 2795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06017015874385834, "epoch": 0.22368, "grad_norm": 0.0, "learning_rate": 3.2194569952437623e-06, "loss": 0.0, "step": 2796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.4140625, "completions/mean_terminated_length": 240.94444274902344, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.07783432677388191, "epoch": 0.22376, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2192055194997497e-06, "loss": 0.0, "num_tokens": 124256464.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.075436320155859, "epoch": 0.22384, "grad_norm": 0.0, "learning_rate": 3.2189539409268677e-06, "loss": 0.0, "step": 2798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.578125, "completions/mean_terminated_length": 210.90667724609375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.06728378683328629, "epoch": 0.22392, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2187022595427254e-06, "loss": 0.0, "num_tokens": 124351386.0, "reward": 0.4327646493911743, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4327646493911743, "rewards/reward_fn/std": 0.985901951789856, "step": 2799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07063126936554909, "epoch": 0.224, "grad_norm": 0.0, "learning_rate": 3.218450475364937e-06, "loss": 0.0, "step": 2800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.6875, "completions/mean_terminated_length": 228.00001525878906, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.06393388286232948, "epoch": 0.22408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2181985884111256e-06, "loss": 0.0, "num_tokens": 124446834.0, "reward": 0.02467191591858864, "reward_std": 0.0, "rewards/reward_fn/mean": 0.02467191591858864, "rewards/reward_fn/std": 0.06553223729133606, "step": 2801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06737193465232849, "epoch": 0.22416, "grad_norm": 0.0, "learning_rate": 3.21794659869892e-06, "loss": 0.0, "step": 2802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 223.78125, "completions/mean_terminated_length": 205.70730590820312, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.07106424868106842, "epoch": 0.22424, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2176945062459562e-06, "loss": 0.0, "num_tokens": 124541014.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 2803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07392846792936325, "epoch": 0.22432, "grad_norm": 0.0, "learning_rate": 3.217442311069879e-06, "loss": 0.0, "step": 2804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 232.9375, "completions/mean_terminated_length": 224.59573364257812, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.06892070174217224, "epoch": 0.2244, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.21719001318834e-06, "loss": 0.0, "num_tokens": 124636366.0, "reward": 0.0074910130351781845, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0074910130351781845, "rewards/reward_fn/std": 0.019897233694791794, "step": 2805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06907587870955467, "epoch": 0.22448, "grad_norm": 0.0, "learning_rate": 3.2169376126189957e-06, "loss": 0.0, "step": 2806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.2109375, "completions/mean_terminated_length": 235.32876586914062, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.06915667280554771, "epoch": 0.22456, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2166851093795124e-06, "loss": 0.0, "num_tokens": 124733161.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06796327233314514, "epoch": 0.22464, "grad_norm": 0.0, "learning_rate": 3.2164325034875624e-06, "loss": 0.0, "step": 2808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 246.8671875, "completions/mean_terminated_length": 238.55223083496094, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.06837120652198792, "epoch": 0.22472, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2161797949608263e-06, "loss": 0.0, "num_tokens": 124830296.0, "reward": 0.0722954273223877, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0722954273223877, "rewards/reward_fn/std": 0.192027285695076, "step": 2809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07022898644208908, "epoch": 0.2248, "grad_norm": 0.0, "learning_rate": 3.21592698381699e-06, "loss": 0.0, "step": 2810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.6640625, "completions/mean_terminated_length": 240.6794891357422, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.06774772331118584, "epoch": 0.22488, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2156740700737483e-06, "loss": 0.0, "num_tokens": 124927405.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06696795299649239, "epoch": 0.22496, "grad_norm": 0.0, "learning_rate": 3.2154210537488015e-06, "loss": 0.0, "step": 2812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.703125, "completions/mean_terminated_length": 228.3616943359375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.0654337890446186, "epoch": 0.22504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2151679348598584e-06, "loss": 0.0, "num_tokens": 125023111.0, "reward": 0.46803462505340576, "reward_std": 0.0, "rewards/reward_fn/mean": 0.46803462505340576, "rewards/reward_fn/std": 0.9913957715034485, "step": 2813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06406961753964424, "epoch": 0.22512, "grad_norm": 0.0, "learning_rate": 3.2149147134246356e-06, "loss": 0.0, "step": 2814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.609375, "completions/mean_terminated_length": 233.68084716796875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.06857404112815857, "epoch": 0.2252, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2146613894608557e-06, "loss": 0.0, "num_tokens": 125119317.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06806233897805214, "epoch": 0.22528, "grad_norm": 0.0, "learning_rate": 3.214407962986248e-06, "loss": 0.0, "step": 2816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.1015625, "completions/mean_terminated_length": 225.7624969482422, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.0676349550485611, "epoch": 0.22536, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2141544340185493e-06, "loss": 0.0, "num_tokens": 125215202.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06094682030379772, "epoch": 0.22544, "grad_norm": 0.0, "learning_rate": 3.213900802575505e-06, "loss": 0.0, "step": 2818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.828125, "completions/mean_terminated_length": 228.86419677734375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.06941653788089752, "epoch": 0.22552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2136470686748664e-06, "loss": 0.0, "num_tokens": 125311308.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06828365474939346, "epoch": 0.2256, "grad_norm": 0.0, "learning_rate": 3.2133932323343917e-06, "loss": 0.0, "step": 2820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 227.6015625, "completions/mean_terminated_length": 191.08929443359375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.07145781815052032, "epoch": 0.22568, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.213139293571848e-06, "loss": 0.0, "num_tokens": 125405977.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06965097784996033, "epoch": 0.22576, "grad_norm": 0.0, "learning_rate": 3.2128852524050067e-06, "loss": 0.0, "step": 2822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 224.3515625, "completions/mean_terminated_length": 205.3625030517578, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.0724780410528183, "epoch": 0.22584, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2126311088516486e-06, "loss": 0.0, "num_tokens": 125500230.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07287590578198433, "epoch": 0.22592, "grad_norm": 0.0, "learning_rate": 3.212376862929561e-06, "loss": 0.0, "step": 2824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.171875, "completions/mean_terminated_length": 236.33766174316406, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.06653035059571266, "epoch": 0.226, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.212122514656539e-06, "loss": 0.0, "num_tokens": 125597020.0, "reward": 0.8206124305725098, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8206124305725098, "rewards/reward_fn/std": 1.2764060497283936, "step": 2825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06696444377303123, "epoch": 0.22608, "grad_norm": 0.0, "learning_rate": 3.211868064050384e-06, "loss": 0.0, "step": 2826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 232.6328125, "completions/mean_terminated_length": 222.3932647705078, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.06991375237703323, "epoch": 0.22616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2116135111289047e-06, "loss": 0.0, "num_tokens": 125692333.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0684896782040596, "epoch": 0.22624, "grad_norm": 0.0, "learning_rate": 3.211358855909917e-06, "loss": 0.0, "step": 2828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.3515625, "completions/mean_terminated_length": 235.4166717529297, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.07581744343042374, "epoch": 0.22632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2111040984112445e-06, "loss": 0.0, "num_tokens": 125789402.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07610888034105301, "epoch": 0.2264, "grad_norm": 0.0, "learning_rate": 3.2108492386507173e-06, "loss": 0.0, "step": 2830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 240.8984375, "completions/mean_terminated_length": 232.98809814453125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.0675412118434906, "epoch": 0.22648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.210594276646173e-06, "loss": 0.0, "num_tokens": 125885773.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06885706260800362, "epoch": 0.22656, "grad_norm": 0.0, "learning_rate": 3.210339212415455e-06, "loss": 0.0, "step": 2832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.875, "completions/mean_terminated_length": 213.6585235595703, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.060322536155581474, "epoch": 0.22664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2100840459764172e-06, "loss": 0.0, "num_tokens": 125980605.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0664897970855236, "epoch": 0.22672, "grad_norm": 0.0, "learning_rate": 3.2098287773469174e-06, "loss": 0.0, "step": 2834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.1875, "completions/mean_terminated_length": 215.62353515625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.07101230695843697, "epoch": 0.2268, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2095734065448222e-06, "loss": 0.0, "num_tokens": 126075477.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 1.3280736207962036, "step": 2835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0707419291138649, "epoch": 0.22688, "grad_norm": 0.0, "learning_rate": 3.2093179335880044e-06, "loss": 0.0, "step": 2836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.4921875, "completions/mean_terminated_length": 240.1764678955078, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.07610062882304192, "epoch": 0.22696, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2090623584943445e-06, "loss": 0.0, "num_tokens": 126172436.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07405189424753189, "epoch": 0.22704, "grad_norm": 0.0, "learning_rate": 3.20880668128173e-06, "loss": 0.0, "step": 2838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 250.375, "completions/mean_terminated_length": 240.3478240966797, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.07288027554750443, "epoch": 0.22712, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.208550901968056e-06, "loss": 0.0, "num_tokens": 126270020.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07463433220982552, "epoch": 0.2272, "grad_norm": 0.0, "learning_rate": 3.2082950205712233e-06, "loss": 0.0, "step": 2840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1640625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 233.421875, "completions/mean_terminated_length": 228.9906463623047, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.0719190426170826, "epoch": 0.22728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2080390371091424e-06, "loss": 0.0, "num_tokens": 126365434.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07300049811601639, "epoch": 0.22736, "grad_norm": 0.0, "learning_rate": 3.207782951599729e-06, "loss": 0.0, "step": 2842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 212.625, "completions/mean_terminated_length": 190.68235778808594, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.059808628633618355, "epoch": 0.22744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2075267640609054e-06, "loss": 0.0, "num_tokens": 126458186.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06131906062364578, "epoch": 0.22752, "grad_norm": 0.0, "learning_rate": 3.207270474510603e-06, "loss": 0.0, "step": 2844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.234375, "completions/mean_terminated_length": 221.6049346923828, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.06606466323137283, "epoch": 0.2276, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2070140829667585e-06, "loss": 0.0, "num_tokens": 126553704.0, "reward": 0.07554597407579422, "reward_std": 0.0, "rewards/reward_fn/mean": 0.07554597407579422, "rewards/reward_fn/std": 0.20066124200820923, "step": 2845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0673828125, "epoch": 0.22768, "grad_norm": 0.0, "learning_rate": 3.206757589447318e-06, "loss": 0.0, "step": 2846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 214.390625, "completions/mean_terminated_length": 194.78160095214844, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.06952721998095512, "epoch": 0.22776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.206500993970232e-06, "loss": 0.0, "num_tokens": 126646682.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07011638209223747, "epoch": 0.22784, "grad_norm": 0.0, "learning_rate": 3.2062442965534603e-06, "loss": 0.0, "step": 2848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 245.296875, "completions/mean_terminated_length": 237.4864959716797, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.06772316992282867, "epoch": 0.22792, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2059874972149687e-06, "loss": 0.0, "num_tokens": 126743616.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06499184668064117, "epoch": 0.228, "grad_norm": 0.0, "learning_rate": 3.20573059597273e-06, "loss": 0.0, "step": 2850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.515625, "completions/mean_terminated_length": 206.14706420898438, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.05958135984838009, "epoch": 0.22808, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.205473592844725e-06, "loss": 0.0, "num_tokens": 126838530.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06194867007434368, "epoch": 0.22816, "grad_norm": 0.0, "learning_rate": 3.205216487848942e-06, "loss": 0.0, "step": 2852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.1484375, "completions/mean_terminated_length": 199.80882263183594, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.0619149561971426, "epoch": 0.22824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2049592810033744e-06, "loss": 0.0, "num_tokens": 126933013.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06280944868922234, "epoch": 0.22832, "grad_norm": 0.0, "learning_rate": 3.204701972326024e-06, "loss": 0.0, "step": 2854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.703125, "completions/mean_terminated_length": 228.6865692138672, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.06757117062807083, "epoch": 0.2284, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.204444561834901e-06, "loss": 0.0, "num_tokens": 127029487.0, "reward": 0.02943696826696396, "reward_std": 0.0, "rewards/reward_fn/mean": 0.02943696826696396, "rewards/reward_fn/std": 0.07818891853094101, "step": 2855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06800275668501854, "epoch": 0.22848, "grad_norm": 0.0, "learning_rate": 3.2041870495480194e-06, "loss": 0.0, "step": 2856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.2421875, "completions/mean_terminated_length": 223.35643005371094, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.06599503010511398, "epoch": 0.22856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.203929435483404e-06, "loss": 0.0, "num_tokens": 127124494.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06505580991506577, "epoch": 0.22864, "grad_norm": 0.0, "learning_rate": 3.2036717196590847e-06, "loss": 0.0, "step": 2858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 227.2109375, "completions/mean_terminated_length": 201.0, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.06962637603282928, "epoch": 0.22872, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.203413902093098e-06, "loss": 0.0, "num_tokens": 127219113.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06833860278129578, "epoch": 0.2288, "grad_norm": 0.0, "learning_rate": 3.20315598280349e-06, "loss": 0.0, "step": 2860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.875, "completions/mean_terminated_length": 212.18182373046875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.06630369275808334, "epoch": 0.22888, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.202897961808311e-06, "loss": 0.0, "num_tokens": 127313561.0, "reward": 0.76492840051651, "reward_std": 0.0, "rewards/reward_fn/mean": 0.76492840051651, "rewards/reward_fn/std": 1.296067476272583, "step": 2861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0652034729719162, "epoch": 0.22896, "grad_norm": 0.0, "learning_rate": 3.20263983912562e-06, "loss": 0.0, "step": 2862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6015625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.078125, "completions/mean_terminated_length": 233.60784912109375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.06731139868497849, "epoch": 0.22904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.202381614773483e-06, "loss": 0.0, "num_tokens": 127410723.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0706106424331665, "epoch": 0.22912, "grad_norm": 0.0, "learning_rate": 3.2021232887699737e-06, "loss": 0.0, "step": 2864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.2265625, "completions/mean_terminated_length": 214.61798095703125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.06056627444922924, "epoch": 0.2292, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2018648611331706e-06, "loss": 0.0, "num_tokens": 127505344.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0632184948772192, "epoch": 0.22928, "grad_norm": 0.0, "learning_rate": 3.2016063318811626e-06, "loss": 0.0, "step": 2866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5546875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.6328125, "completions/mean_terminated_length": 239.45614624023438, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.07604563236236572, "epoch": 0.22936, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2013477010320427e-06, "loss": 0.0, "num_tokens": 127602705.0, "reward": 0.04961630329489708, "reward_std": 0.0, "rewards/reward_fn/mean": 0.04961630329489708, "rewards/reward_fn/std": 0.1317882090806961, "step": 2867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07432325929403305, "epoch": 0.22944, "grad_norm": 0.0, "learning_rate": 3.2010889686039133e-06, "loss": 0.0, "step": 2868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 232.296875, "completions/mean_terminated_length": 203.6896514892578, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.0637328140437603, "epoch": 0.22952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2008301346148826e-06, "loss": 0.0, "num_tokens": 127697975.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06591970846056938, "epoch": 0.2296, "grad_norm": 0.0, "learning_rate": 3.2005711990830653e-06, "loss": 0.0, "step": 2870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.171875, "completions/mean_terminated_length": 235.0, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.06841959804296494, "epoch": 0.22968, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.2003121620265858e-06, "loss": 0.0, "num_tokens": 127794893.0, "reward": 0.3874585032463074, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3874585032463074, "rewards/reward_fn/std": 0.9918686747550964, "step": 2871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06974215805530548, "epoch": 0.22976, "grad_norm": 0.0, "learning_rate": 3.200053023463573e-06, "loss": 0.0, "step": 2872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.703125, "completions/mean_terminated_length": 239.6615447998047, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "entropy": 0.06582685559988022, "epoch": 0.22984, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1997937834121635e-06, "loss": 0.0, "num_tokens": 127892135.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06609766185283661, "epoch": 0.22992, "grad_norm": 0.0, "learning_rate": 3.199534441890502e-06, "loss": 0.0, "step": 2874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.921875, "completions/mean_terminated_length": 234.28915405273438, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.06569397076964378, "epoch": 0.23, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.19927499891674e-06, "loss": 0.0, "num_tokens": 127988637.0, "reward": 0.002499666763469577, "reward_std": 0.0, "rewards/reward_fn/mean": 0.002499666763469577, "rewards/reward_fn/std": 0.006639483384788036, "step": 2875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06469292938709259, "epoch": 0.23008, "grad_norm": 0.0, "learning_rate": 3.199015454509035e-06, "loss": 0.0, "step": 2876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 208.8203125, "completions/mean_terminated_length": 193.7422637939453, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.060462091118097305, "epoch": 0.23016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.198755808685553e-06, "loss": 0.0, "num_tokens": 128080902.0, "reward": 0.4908899664878845, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4908899664878845, "rewards/reward_fn/std": 0.9891904592514038, "step": 2877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06406654790043831, "epoch": 0.23024, "grad_norm": 0.0, "learning_rate": 3.198496061464466e-06, "loss": 0.0, "step": 2878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6640625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 250.9765625, "completions/mean_terminated_length": 241.0465087890625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.0637657605111599, "epoch": 0.23032, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1982362128639535e-06, "loss": 0.0, "num_tokens": 128178563.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06148502789437771, "epoch": 0.2304, "grad_norm": 0.0, "learning_rate": 3.1979762629022025e-06, "loss": 0.0, "step": 2880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 231.4453125, "completions/mean_terminated_length": 211.73239135742188, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.06475673988461494, "epoch": 0.23048, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1977162115974063e-06, "loss": 0.0, "num_tokens": 128273724.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06487468630075455, "epoch": 0.23056, "grad_norm": 0.0, "learning_rate": 3.197456058967767e-06, "loss": 0.0, "step": 2882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.1328125, "completions/mean_terminated_length": 240.0105438232422, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.06580494344234467, "epoch": 0.23064, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.19719580503149e-06, "loss": 0.0, "num_tokens": 128370509.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06682415679097176, "epoch": 0.23072, "grad_norm": 0.0, "learning_rate": 3.1969354498067934e-06, "loss": 0.0, "step": 2884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 223.3515625, "completions/mean_terminated_length": 209.04495239257812, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.0706295482814312, "epoch": 0.2308, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.196674993311897e-06, "loss": 0.0, "num_tokens": 128464634.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0697217583656311, "epoch": 0.23088, "grad_norm": 0.0, "learning_rate": 3.1964144355650315e-06, "loss": 0.0, "step": 2886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.6328125, "completions/mean_terminated_length": 199.38201904296875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.07569505274295807, "epoch": 0.23096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.196153776584432e-06, "loss": 0.0, "num_tokens": 128557899.0, "reward": 0.5291802287101746, "reward_std": 0.0, "rewards/reward_fn/mean": 0.5291802287101746, "rewards/reward_fn/std": 0.9920058846473694, "step": 2887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07452671974897385, "epoch": 0.23104, "grad_norm": 0.0, "learning_rate": 3.1958930163883424e-06, "loss": 0.0, "step": 2888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.1953125, "completions/mean_terminated_length": 216.55406188964844, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.06585677340626717, "epoch": 0.23112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1956321549950137e-06, "loss": 0.0, "num_tokens": 128653284.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06853075325489044, "epoch": 0.2312, "grad_norm": 0.0, "learning_rate": 3.1953711924227025e-06, "loss": 0.0, "step": 2890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 249.71875, "completions/mean_terminated_length": 244.0, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.06575236469507217, "epoch": 0.23128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.195110128689674e-06, "loss": 0.0, "num_tokens": 128750784.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06749095022678375, "epoch": 0.23136, "grad_norm": 0.0, "learning_rate": 3.1948489638141996e-06, "loss": 0.0, "step": 2892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.9375, "completions/mean_terminated_length": 223.23403930664062, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06688553839921951, "epoch": 0.23144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1945876978145586e-06, "loss": 0.0, "num_tokens": 128846008.0, "reward": 0.002499666763469577, "reward_std": 0.0, "rewards/reward_fn/mean": 0.002499666763469577, "rewards/reward_fn/std": 0.006639483384788036, "step": 2893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06735505908727646, "epoch": 0.23152, "grad_norm": 0.0, "learning_rate": 3.1943263307090357e-06, "loss": 0.0, "step": 2894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.9140625, "completions/mean_terminated_length": 238.03797912597656, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.07080109789967537, "epoch": 0.2316, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.194064862515925e-06, "loss": 0.0, "num_tokens": 128942893.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06985266879200935, "epoch": 0.23168, "grad_norm": 0.0, "learning_rate": 3.1938032932535265e-06, "loss": 0.0, "step": 2896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.6328125, "completions/mean_terminated_length": 239.57534790039062, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.06240418925881386, "epoch": 0.23176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.193541622940147e-06, "loss": 0.0, "num_tokens": 129039998.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06248636916279793, "epoch": 0.23184, "grad_norm": 0.0, "learning_rate": 3.1932798515941e-06, "loss": 0.0, "step": 2898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.4375, "completions/mean_terminated_length": 243.0933380126953, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.07087674736976624, "epoch": 0.23192, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1930179792337073e-06, "loss": 0.0, "num_tokens": 129137334.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07202357426285744, "epoch": 0.232, "grad_norm": 0.0, "learning_rate": 3.192756005877297e-06, "loss": 0.0, "step": 2900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.1796875, "completions/mean_terminated_length": 237.79031372070312, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.07310844212770462, "epoch": 0.23208, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1924939315432057e-06, "loss": 0.0, "num_tokens": 129234509.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07328888028860092, "epoch": 0.23216, "grad_norm": 0.0, "learning_rate": 3.1922317562497736e-06, "loss": 0.0, "step": 2902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.328125, "completions/mean_terminated_length": 216.84535217285156, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.06880494207143784, "epoch": 0.23224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.191969480015352e-06, "loss": 0.0, "num_tokens": 129329015.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06876019388437271, "epoch": 0.23232, "grad_norm": 0.0, "learning_rate": 3.1917071028582972e-06, "loss": 0.0, "step": 2904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.9609375, "completions/mean_terminated_length": 223.0, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.06549745053052902, "epoch": 0.2324, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.191444624796972e-06, "loss": 0.0, "num_tokens": 129423986.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06992459669709206, "epoch": 0.23248, "grad_norm": 0.0, "learning_rate": 3.1911820458497477e-06, "loss": 0.0, "step": 2906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.484375, "completions/mean_terminated_length": 239.3150634765625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.06563360244035721, "epoch": 0.23256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1909193660350023e-06, "loss": 0.0, "num_tokens": 129521072.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06452423706650734, "epoch": 0.23264, "grad_norm": 0.0, "learning_rate": 3.1906565853711196e-06, "loss": 0.0, "step": 2908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.1875, "completions/mean_terminated_length": 228.13186645507812, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.06493572145700455, "epoch": 0.23272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1903937038764933e-06, "loss": 0.0, "num_tokens": 129616840.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06485059857368469, "epoch": 0.2328, "grad_norm": 0.0, "learning_rate": 3.19013072156952e-06, "loss": 0.0, "step": 2910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 247.234375, "completions/mean_terminated_length": 241.04000854492188, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.07509347423911095, "epoch": 0.23288, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1898676384686073e-06, "loss": 0.0, "num_tokens": 129714022.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0727895125746727, "epoch": 0.23296, "grad_norm": 0.0, "learning_rate": 3.1896044545921677e-06, "loss": 0.0, "step": 2912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5078125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.171875, "completions/mean_terminated_length": 231.96826171875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.066795215010643, "epoch": 0.23304, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.189341169958621e-06, "loss": 0.0, "num_tokens": 129810812.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06922993808984756, "epoch": 0.23312, "grad_norm": 0.0, "learning_rate": 3.1890777845863954e-06, "loss": 0.0, "step": 2914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 217.703125, "completions/mean_terminated_length": 197.6428680419922, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.07052041590213776, "epoch": 0.2332, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1888142984939247e-06, "loss": 0.0, "num_tokens": 129904214.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06897243112325668, "epoch": 0.23328, "grad_norm": 0.0, "learning_rate": 3.1885507116996487e-06, "loss": 0.0, "step": 2916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 231.8671875, "completions/mean_terminated_length": 211.2318878173828, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.06385659798979759, "epoch": 0.23336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1882870242220175e-06, "loss": 0.0, "num_tokens": 129999429.0, "reward": 0.0982079803943634, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0982079803943634, "rewards/reward_fn/std": 0.2608548700809479, "step": 2917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06404456868767738, "epoch": 0.23344, "grad_norm": 0.0, "learning_rate": 3.188023236079486e-06, "loss": 0.0, "step": 2918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.3671875, "completions/mean_terminated_length": 234.22093200683594, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.07420437783002853, "epoch": 0.23352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1877593472905156e-06, "loss": 0.0, "num_tokens": 130095860.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.071338951587677, "epoch": 0.2336, "grad_norm": 0.0, "learning_rate": 3.187495357873577e-06, "loss": 0.0, "step": 2920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6796875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 251.1875, "completions/mean_terminated_length": 240.97560119628906, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.06603925675153732, "epoch": 0.23368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1872312678471462e-06, "loss": 0.0, "num_tokens": 130193548.0, "reward": 0.03868836537003517, "reward_std": 0.0, "rewards/reward_fn/mean": 0.03868836537003517, "rewards/reward_fn/std": 0.10276199877262115, "step": 2921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06529200077056885, "epoch": 0.23376, "grad_norm": 0.0, "learning_rate": 3.186967077229707e-06, "loss": 0.0, "step": 2922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 219.8359375, "completions/mean_terminated_length": 213.5321044921875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.06430870294570923, "epoch": 0.23384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.186702786039749e-06, "loss": 0.0, "num_tokens": 130287223.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06327813491225243, "epoch": 0.23392, "grad_norm": 0.0, "learning_rate": 3.1864383942957706e-06, "loss": 0.0, "step": 2924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.25, "completions/mean_terminated_length": 231.3563232421875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.06727752089500427, "epoch": 0.234, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1861739020162762e-06, "loss": 0.0, "num_tokens": 130383383.0, "reward": 0.10344578325748444, "reward_std": 0.0, "rewards/reward_fn/mean": 0.10344578325748444, "rewards/reward_fn/std": 0.2672600746154785, "step": 2925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07014705240726471, "epoch": 0.23408, "grad_norm": 0.0, "learning_rate": 3.1859093092197776e-06, "loss": 0.0, "step": 2926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.96875, "completions/mean_terminated_length": 211.02127075195312, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.07671946659684181, "epoch": 0.23416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1856446159247937e-06, "loss": 0.0, "num_tokens": 130477459.0, "reward": 0.0688910037279129, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0688910037279129, "rewards/reward_fn/std": 0.18298465013504028, "step": 2927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07532909139990807, "epoch": 0.23424, "grad_norm": 0.0, "learning_rate": 3.1853798221498498e-06, "loss": 0.0, "step": 2928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 233.46875, "completions/mean_terminated_length": 222.07058715820312, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.06383149698376656, "epoch": 0.23432, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.185114927913479e-06, "loss": 0.0, "num_tokens": 130572879.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06577713415026665, "epoch": 0.2344, "grad_norm": 0.0, "learning_rate": 3.1848499332342204e-06, "loss": 0.0, "step": 2930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.734375, "completions/mean_terminated_length": 227.83131408691406, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.06141666695475578, "epoch": 0.23448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1845848381306216e-06, "loss": 0.0, "num_tokens": 130668845.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06506644189357758, "epoch": 0.23456, "grad_norm": 0.0, "learning_rate": 3.1843196426212365e-06, "loss": 0.0, "step": 2932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 238.8671875, "completions/mean_terminated_length": 227.5194854736328, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.07598307728767395, "epoch": 0.23464, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1840543467246256e-06, "loss": 0.0, "num_tokens": 130764956.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 2933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07918303087353706, "epoch": 0.23472, "grad_norm": 0.0, "learning_rate": 3.1837889504593564e-06, "loss": 0.0, "step": 2934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 228.390625, "completions/mean_terminated_length": 209.5, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.0755259320139885, "epoch": 0.2348, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1835234538440047e-06, "loss": 0.0, "num_tokens": 130859726.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0751158632338047, "epoch": 0.23488, "grad_norm": 0.0, "learning_rate": 3.183257856897152e-06, "loss": 0.0, "step": 2936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 243.921875, "completions/mean_terminated_length": 234.82191467285156, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.06702468916773796, "epoch": 0.23496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.182992159637388e-06, "loss": 0.0, "num_tokens": 130956484.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06829294189810753, "epoch": 0.23504, "grad_norm": 0.0, "learning_rate": 3.182726362083307e-06, "loss": 0.0, "step": 2938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 229.0078125, "completions/mean_terminated_length": 218.44566345214844, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.06999526545405388, "epoch": 0.23512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.182460464253514e-06, "loss": 0.0, "num_tokens": 131051333.0, "reward": 0.4792068302631378, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4792068302631378, "rewards/reward_fn/std": 0.9949710965156555, "step": 2939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07024548575282097, "epoch": 0.2352, "grad_norm": 0.0, "learning_rate": 3.182194466166618e-06, "loss": 0.0, "step": 2940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.2890625, "completions/mean_terminated_length": 217.42681884765625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.06844403967261314, "epoch": 0.23528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.181928367841236e-06, "loss": 0.0, "num_tokens": 131146474.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07411892712116241, "epoch": 0.23536, "grad_norm": 0.0, "learning_rate": 3.181662169295992e-06, "loss": 0.0, "step": 2942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.4921875, "completions/mean_terminated_length": 228.99998474121094, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.0701812207698822, "epoch": 0.23544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.181395870549518e-06, "loss": 0.0, "num_tokens": 131242537.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07252568751573563, "epoch": 0.23552, "grad_norm": 0.0, "learning_rate": 3.181129471620451e-06, "loss": 0.0, "step": 2944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.21875, "completions/mean_terminated_length": 226.68292236328125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.08128058165311813, "epoch": 0.2356, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1808629725274362e-06, "loss": 0.0, "num_tokens": 131338437.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08007887750864029, "epoch": 0.23568, "grad_norm": 0.0, "learning_rate": 3.1805963732891266e-06, "loss": 0.0, "step": 2946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.9921875, "completions/mean_terminated_length": 236.2948760986328, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.06963237002491951, "epoch": 0.23576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.180329673924181e-06, "loss": 0.0, "num_tokens": 131435204.0, "reward": 0.05776464566588402, "reward_std": 0.0, "rewards/reward_fn/mean": 0.05776464566588402, "rewards/reward_fn/std": 0.15343140065670013, "step": 2947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07468575239181519, "epoch": 0.23584, "grad_norm": 0.0, "learning_rate": 3.180062874451265e-06, "loss": 0.0, "step": 2948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.8828125, "completions/mean_terminated_length": 234.74118041992188, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.0705736018717289, "epoch": 0.23592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.179795974889052e-06, "loss": 0.0, "num_tokens": 131531701.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07020483165979385, "epoch": 0.236, "grad_norm": 0.0, "learning_rate": 3.1795289752562228e-06, "loss": 0.0, "step": 2950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.21875, "completions/mean_terminated_length": 211.3333282470703, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.06788726523518562, "epoch": 0.23608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.179261875571464e-06, "loss": 0.0, "num_tokens": 131624913.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06593415141105652, "epoch": 0.23616, "grad_norm": 0.0, "learning_rate": 3.1789946758534694e-06, "loss": 0.0, "step": 2952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 234.890625, "completions/mean_terminated_length": 222.64198303222656, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.0717763677239418, "epoch": 0.23624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1787273761209406e-06, "loss": 0.0, "num_tokens": 131720515.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07348769530653954, "epoch": 0.23632, "grad_norm": 0.0, "learning_rate": 3.178459976392586e-06, "loss": 0.0, "step": 2954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 231.640625, "completions/mean_terminated_length": 216.02565002441406, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.06478064879775047, "epoch": 0.2364, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1781924766871196e-06, "loss": 0.0, "num_tokens": 131815701.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 2955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07155433297157288, "epoch": 0.23648, "grad_norm": 0.0, "learning_rate": 3.1779248770232655e-06, "loss": 0.0, "step": 2956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.34375, "completions/mean_terminated_length": 219.72413635253906, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.06396935507655144, "epoch": 0.23656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.177657177419751e-06, "loss": 0.0, "num_tokens": 131910849.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06740273535251617, "epoch": 0.23664, "grad_norm": 0.0, "learning_rate": 3.1773893778953127e-06, "loss": 0.0, "step": 2958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 222.6796875, "completions/mean_terminated_length": 212.03091430664062, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.0731445662677288, "epoch": 0.23672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1771214784686953e-06, "loss": 0.0, "num_tokens": 132004888.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0716388002038002, "epoch": 0.2368, "grad_norm": 0.0, "learning_rate": 3.1768534791586462e-06, "loss": 0.0, "step": 2960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.2734375, "completions/mean_terminated_length": 207.38356018066406, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.06739775836467743, "epoch": 0.23688, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.176585379983925e-06, "loss": 0.0, "num_tokens": 132099643.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07548385113477707, "epoch": 0.23696, "grad_norm": 0.0, "learning_rate": 3.1763171809632946e-06, "loss": 0.0, "step": 2962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 242.2109375, "completions/mean_terminated_length": 236.60440063476562, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.06619386374950409, "epoch": 0.23704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.176048882115526e-06, "loss": 0.0, "num_tokens": 132196182.0, "reward": 0.019831063225865364, "reward_std": 0.0, "rewards/reward_fn/mean": 0.019831063225865364, "rewards/reward_fn/std": 0.052674222737550735, "step": 2963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06369108706712723, "epoch": 0.23712, "grad_norm": 0.0, "learning_rate": 3.175780483459398e-06, "loss": 0.0, "step": 2964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.59375, "completions/mean_terminated_length": 210.44155883789062, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.07571519911289215, "epoch": 0.2372, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1755119850136945e-06, "loss": 0.0, "num_tokens": 132290978.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07396650314331055, "epoch": 0.23728, "grad_norm": 0.0, "learning_rate": 3.175243386797209e-06, "loss": 0.0, "step": 2966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 224.328125, "completions/mean_terminated_length": 219.8035888671875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.061643004417419434, "epoch": 0.23736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1749746888287393e-06, "loss": 0.0, "num_tokens": 132385228.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06082381308078766, "epoch": 0.23744, "grad_norm": 0.0, "learning_rate": 3.1747058911270924e-06, "loss": 0.0, "step": 2968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 249.0859375, "completions/mean_terminated_length": 243.876708984375, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.0637814961373806, "epoch": 0.23752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1744369937110807e-06, "loss": 0.0, "num_tokens": 132482647.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06580029055476189, "epoch": 0.2376, "grad_norm": 0.0, "learning_rate": 3.1741679965995244e-06, "loss": 0.0, "step": 2970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.2890625, "completions/mean_terminated_length": 236.71429443359375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.06502771005034447, "epoch": 0.23768, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1738988998112498e-06, "loss": 0.0, "num_tokens": 132579196.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06462564691901207, "epoch": 0.23776, "grad_norm": 0.0, "learning_rate": 3.1736297033650926e-06, "loss": 0.0, "step": 2972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.2734375, "completions/mean_terminated_length": 237.93421936035156, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.07105622068047523, "epoch": 0.23784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.173360407279892e-06, "loss": 0.0, "num_tokens": 132676127.0, "reward": 0.12251204997301102, "reward_std": 0.0, "rewards/reward_fn/mean": 0.12251204997301102, "rewards/reward_fn/std": 0.32541003823280334, "step": 2973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07102350890636444, "epoch": 0.23792, "grad_norm": 0.0, "learning_rate": 3.173091011574496e-06, "loss": 0.0, "step": 2974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 218.9140625, "completions/mean_terminated_length": 203.25555419921875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.0770421251654625, "epoch": 0.238, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1728215162677603e-06, "loss": 0.0, "num_tokens": 132769684.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06912631914019585, "epoch": 0.23808, "grad_norm": 0.0, "learning_rate": 3.172551921378546e-06, "loss": 0.0, "step": 2976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 215.8828125, "completions/mean_terminated_length": 207.09524536132812, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.07252445444464684, "epoch": 0.23816, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1722822269257225e-06, "loss": 0.0, "num_tokens": 132862853.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07017023116350174, "epoch": 0.23824, "grad_norm": 0.0, "learning_rate": 3.1720124329281652e-06, "loss": 0.0, "step": 2978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 208.4453125, "completions/mean_terminated_length": 191.2446746826172, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.07039409130811691, "epoch": 0.23832, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1717425394047563e-06, "loss": 0.0, "num_tokens": 132955070.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 2979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0730079934000969, "epoch": 0.2384, "grad_norm": 0.0, "learning_rate": 3.171472546374387e-06, "loss": 0.0, "step": 2980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.8046875, "completions/mean_terminated_length": 240.51315307617188, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.07443159446120262, "epoch": 0.23848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1712024538559523e-06, "loss": 0.0, "num_tokens": 133052197.0, "reward": 0.06533318012952805, "reward_std": 0.0, "rewards/reward_fn/mean": 0.06533318012952805, "rewards/reward_fn/std": 0.1735345423221588, "step": 2981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07823509350419044, "epoch": 0.23856, "grad_norm": 0.0, "learning_rate": 3.1709322618683575e-06, "loss": 0.0, "step": 2982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 224.9609375, "completions/mean_terminated_length": 202.31082153320312, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.07132704555988312, "epoch": 0.23864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1706619704305116e-06, "loss": 0.0, "num_tokens": 133146528.0, "reward": 0.49818843603134155, "reward_std": 0.0, "rewards/reward_fn/mean": 0.49818843603134155, "rewards/reward_fn/std": 1.0030310153961182, "step": 2983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06868357211351395, "epoch": 0.23872, "grad_norm": 0.0, "learning_rate": 3.1703915795613323e-06, "loss": 0.0, "step": 2984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.0390625, "completions/mean_terminated_length": 211.0946044921875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.06442562490701675, "epoch": 0.2388, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1701210892797454e-06, "loss": 0.0, "num_tokens": 133241509.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06386387348175049, "epoch": 0.23888, "grad_norm": 0.0, "learning_rate": 3.169850499604681e-06, "loss": 0.0, "step": 2986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 230.4609375, "completions/mean_terminated_length": 219.6777801513672, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.06848632171750069, "epoch": 0.23896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1695798105550793e-06, "loss": 0.0, "num_tokens": 133336544.0, "reward": 0.4438909888267517, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4438909888267517, "rewards/reward_fn/std": 0.9866783618927002, "step": 2987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06589771062135696, "epoch": 0.23904, "grad_norm": 0.0, "learning_rate": 3.169309022149883e-06, "loss": 0.0, "step": 2988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.6875, "completions/mean_terminated_length": 203.375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.0691191628575325, "epoch": 0.23912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1690381344080464e-06, "loss": 0.0, "num_tokens": 133431480.0, "reward": 0.384978711605072, "reward_std": 0.0, "rewards/reward_fn/mean": 0.384978711605072, "rewards/reward_fn/std": 0.9926154613494873, "step": 2989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06660557724535465, "epoch": 0.2392, "grad_norm": 0.0, "learning_rate": 3.1687671473485286e-06, "loss": 0.0, "step": 2990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.9453125, "completions/mean_terminated_length": 236.1999969482422, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.0679863840341568, "epoch": 0.23928, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1684960609902945e-06, "loss": 0.0, "num_tokens": 133528497.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06784609705209732, "epoch": 0.23936, "grad_norm": 0.0, "learning_rate": 3.1682248753523197e-06, "loss": 0.0, "step": 2992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 232.1953125, "completions/mean_terminated_length": 211.840576171875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.06728869676589966, "epoch": 0.23944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.167953590453582e-06, "loss": 0.0, "num_tokens": 133623754.0, "reward": 0.4136883616447449, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4136883616447449, "rewards/reward_fn/std": 0.9866312742233276, "step": 2993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06799356639385223, "epoch": 0.23952, "grad_norm": 0.0, "learning_rate": 3.1676822063130695e-06, "loss": 0.0, "step": 2994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.75, "completions/mean_terminated_length": 232.3116912841797, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.06911305338144302, "epoch": 0.2396, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1674107229497764e-06, "loss": 0.0, "num_tokens": 133720234.0, "reward": 0.4979593753814697, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4979593753814697, "rewards/reward_fn/std": 1.00291907787323, "step": 2995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06902730092406273, "epoch": 0.23968, "grad_norm": 0.0, "learning_rate": 3.1671391403827033e-06, "loss": 0.0, "step": 2996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.703125, "completions/mean_terminated_length": 234.4835205078125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.06716218590736389, "epoch": 0.23976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.166867458630858e-06, "loss": 0.0, "num_tokens": 133816580.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 2997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06874871999025345, "epoch": 0.23984, "grad_norm": 0.0, "learning_rate": 3.1665956777132553e-06, "loss": 0.0, "step": 2998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.3359375, "completions/mean_terminated_length": 216.53750610351562, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.06602967530488968, "epoch": 0.23992, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1663237976489173e-06, "loss": 0.0, "num_tokens": 133911727.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 2999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06998446583747864, "epoch": 0.24, "grad_norm": 0.0, "learning_rate": 3.1660518184568735e-06, "loss": 0.0, "step": 3000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.1484375, "completions/mean_terminated_length": 223.83544921875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.07316945865750313, "epoch": 0.24008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.165779740156158e-06, "loss": 0.0, "num_tokens": 134007490.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07174872234463692, "epoch": 0.24016, "grad_norm": 0.0, "learning_rate": 3.1655075627658145e-06, "loss": 0.0, "step": 3002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.046875, "completions/mean_terminated_length": 236.57627868652344, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.07866664975881577, "epoch": 0.24024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1652352863048917e-06, "loss": 0.0, "num_tokens": 134104648.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07794361934065819, "epoch": 0.24032, "grad_norm": 0.0, "learning_rate": 3.164962910792447e-06, "loss": 0.0, "step": 3004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.828125, "completions/mean_terminated_length": 220.59341430664062, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.06912068277597427, "epoch": 0.2404, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1646904362475433e-06, "loss": 0.0, "num_tokens": 134199730.0, "reward": 0.012458499521017075, "reward_std": 0.0, "rewards/reward_fn/mean": 0.012458499521017075, "rewards/reward_fn/std": 0.03309160843491554, "step": 3005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07101325318217278, "epoch": 0.24048, "grad_norm": 0.0, "learning_rate": 3.164417862689251e-06, "loss": 0.0, "step": 3006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 225.5234375, "completions/mean_terminated_length": 218.49038696289062, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.07048685848712921, "epoch": 0.24056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1641451901366473e-06, "loss": 0.0, "num_tokens": 134294133.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06972230225801468, "epoch": 0.24064, "grad_norm": 0.0, "learning_rate": 3.163872418608817e-06, "loss": 0.0, "step": 3008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 233.5390625, "completions/mean_terminated_length": 224.75, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.06536096706986427, "epoch": 0.24072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.16359954812485e-06, "loss": 0.0, "num_tokens": 134389562.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06418158859014511, "epoch": 0.2408, "grad_norm": 0.0, "learning_rate": 3.1633265787038453e-06, "loss": 0.0, "step": 3010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.8671875, "completions/mean_terminated_length": 239.58229064941406, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.0726749636232853, "epoch": 0.24088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1630535103649077e-06, "loss": 0.0, "num_tokens": 134486569.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07297356426715851, "epoch": 0.24096, "grad_norm": 0.0, "learning_rate": 3.1627803431271485e-06, "loss": 0.0, "step": 3012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 236.421875, "completions/mean_terminated_length": 221.19444274902344, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.06447308510541916, "epoch": 0.24104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.162507077009688e-06, "loss": 0.0, "num_tokens": 134582367.0, "reward": 0.11520856618881226, "reward_std": 0.0, "rewards/reward_fn/mean": 0.11520856618881226, "rewards/reward_fn/std": 0.306010901927948, "step": 3013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06711403280496597, "epoch": 0.24112, "grad_norm": 0.0, "learning_rate": 3.1622337120316505e-06, "loss": 0.0, "step": 3014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.0546875, "completions/mean_terminated_length": 219.25758361816406, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.06708282232284546, "epoch": 0.2412, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1619602482121692e-06, "loss": 0.0, "num_tokens": 134678246.0, "reward": 0.019831063225865364, "reward_std": 0.0, "rewards/reward_fn/mean": 0.019831063225865364, "rewards/reward_fn/std": 0.052674222737550735, "step": 3015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06724167987704277, "epoch": 0.24128, "grad_norm": 0.0, "learning_rate": 3.1616866855703835e-06, "loss": 0.0, "step": 3016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.796875, "completions/mean_terminated_length": 235.8000030517578, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.07361358031630516, "epoch": 0.24136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1614130241254407e-06, "loss": 0.0, "num_tokens": 134774732.0, "reward": 0.12424539774656296, "reward_std": 0.0, "rewards/reward_fn/mean": 0.12424539774656296, "rewards/reward_fn/std": 0.33001407980918884, "step": 3017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07143723964691162, "epoch": 0.24144, "grad_norm": 0.0, "learning_rate": 3.1611392638964924e-06, "loss": 0.0, "step": 3018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.015625, "completions/mean_terminated_length": 215.4390106201172, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.07827022671699524, "epoch": 0.24152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.160865404902701e-06, "loss": 0.0, "num_tokens": 134869710.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0775766409933567, "epoch": 0.2416, "grad_norm": 0.0, "learning_rate": 3.1605914471632324e-06, "loss": 0.0, "step": 3020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.9765625, "completions/mean_terminated_length": 209.69879150390625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.0662425309419632, "epoch": 0.24168, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1603173906972617e-06, "loss": 0.0, "num_tokens": 134964171.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06674159318208694, "epoch": 0.24176, "grad_norm": 0.0, "learning_rate": 3.1600432355239687e-06, "loss": 0.0, "step": 3022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6015625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.5078125, "completions/mean_terminated_length": 214.60784912109375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.061412397772073746, "epoch": 0.24184, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1597689816625422e-06, "loss": 0.0, "num_tokens": 135060364.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06215192377567291, "epoch": 0.24192, "grad_norm": 0.0, "learning_rate": 3.1594946291321773e-06, "loss": 0.0, "step": 3024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.8046875, "completions/mean_terminated_length": 218.31944274902344, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.06367229297757149, "epoch": 0.242, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.159220177952075e-06, "loss": 0.0, "num_tokens": 135155955.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06259936094284058, "epoch": 0.24208, "grad_norm": 0.0, "learning_rate": 3.1589456281414455e-06, "loss": 0.0, "step": 3026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 205.65625, "completions/mean_terminated_length": 188.875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.06987767666578293, "epoch": 0.24216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.158670979719502e-06, "loss": 0.0, "num_tokens": 135247815.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 3027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.066818967461586, "epoch": 0.24224, "grad_norm": 0.0, "learning_rate": 3.1583962327054688e-06, "loss": 0.0, "step": 3028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 198.515625, "completions/mean_terminated_length": 176.8817138671875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.07282597944140434, "epoch": 0.24232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1581213871185754e-06, "loss": 0.0, "num_tokens": 135338761.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07473631575703621, "epoch": 0.2424, "grad_norm": 0.0, "learning_rate": 3.1578464429780564e-06, "loss": 0.0, "step": 3030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.2421875, "completions/mean_terminated_length": 211.188232421875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.07140122354030609, "epoch": 0.24248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1575714003031566e-06, "loss": 0.0, "num_tokens": 135433256.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07563192397356033, "epoch": 0.24256, "grad_norm": 0.0, "learning_rate": 3.157296259113126e-06, "loss": 0.0, "step": 3032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 241.03125, "completions/mean_terminated_length": 233.9770050048828, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.07321467250585556, "epoch": 0.24264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1570210194272203e-06, "loss": 0.0, "num_tokens": 135529644.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07070863991975784, "epoch": 0.24272, "grad_norm": 0.0, "learning_rate": 3.1567456812647044e-06, "loss": 0.0, "step": 3034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.5390625, "completions/mean_terminated_length": 214.37879943847656, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.06746497750282288, "epoch": 0.2428, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.156470244644849e-06, "loss": 0.0, "num_tokens": 135625201.0, "reward": 0.07393992692232132, "reward_std": 0.0, "rewards/reward_fn/mean": 0.07393992692232132, "rewards/reward_fn/std": 0.19639533758163452, "step": 3035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06867868825793266, "epoch": 0.24288, "grad_norm": 0.0, "learning_rate": 3.1561947095869317e-06, "loss": 0.0, "step": 3036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.5078125, "completions/mean_terminated_length": 212.4933319091797, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.06300869211554527, "epoch": 0.24296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1559190761102366e-06, "loss": 0.0, "num_tokens": 135720242.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06358353421092033, "epoch": 0.24304, "grad_norm": 0.0, "learning_rate": 3.1556433442340555e-06, "loss": 0.0, "step": 3038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 211.203125, "completions/mean_terminated_length": 190.09194946289062, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.07069270685315132, "epoch": 0.24312, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.155367513977687e-06, "loss": 0.0, "num_tokens": 135812812.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07007990777492523, "epoch": 0.2432, "grad_norm": 0.0, "learning_rate": 3.1550915853604358e-06, "loss": 0.0, "step": 3040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 225.0078125, "completions/mean_terminated_length": 211.92222595214844, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.07045679911971092, "epoch": 0.24328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1548155584016145e-06, "loss": 0.0, "num_tokens": 135907149.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07274733856320381, "epoch": 0.24336, "grad_norm": 0.0, "learning_rate": 3.154539433120542e-06, "loss": 0.0, "step": 3042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.578125, "completions/mean_terminated_length": 237.49368286132812, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.07152510806918144, "epoch": 0.24344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1542632095365436e-06, "loss": 0.0, "num_tokens": 136003991.0, "reward": 0.009978720918297768, "reward_std": 0.0, "rewards/reward_fn/mean": 0.009978720918297768, "rewards/reward_fn/std": 0.02650495432317257, "step": 3043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07179131731390953, "epoch": 0.24352, "grad_norm": 0.0, "learning_rate": 3.153986887668952e-06, "loss": 0.0, "step": 3044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.984375, "completions/mean_terminated_length": 211.5277862548828, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.0665157102048397, "epoch": 0.2436, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1537104675371073e-06, "loss": 0.0, "num_tokens": 136099093.0, "reward": 0.46072614192962646, "reward_std": 0.0, "rewards/reward_fn/mean": 0.46072614192962646, "rewards/reward_fn/std": 0.9895316362380981, "step": 3045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06500022485852242, "epoch": 0.24368, "grad_norm": 0.0, "learning_rate": 3.153433949160356e-06, "loss": 0.0, "step": 3046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.0390625, "completions/mean_terminated_length": 213.84042358398438, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.07067714631557465, "epoch": 0.24376, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.153157332558051e-06, "loss": 0.0, "num_tokens": 136193434.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06851517409086227, "epoch": 0.24384, "grad_norm": 0.0, "learning_rate": 3.1528806177495532e-06, "loss": 0.0, "step": 3048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.03125, "completions/mean_terminated_length": 217.13726806640625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.07516857981681824, "epoch": 0.24392, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.15260380475423e-06, "loss": 0.0, "num_tokens": 136287774.0, "reward": 0.5375781655311584, "reward_std": 0.0, "rewards/reward_fn/mean": 0.5375781655311584, "rewards/reward_fn/std": 0.9745171070098877, "step": 3049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0771709494292736, "epoch": 0.244, "grad_norm": 0.0, "learning_rate": 3.152326893591454e-06, "loss": 0.0, "step": 3050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.2890625, "completions/mean_terminated_length": 225.5287322998047, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.06704743206501007, "epoch": 0.24408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1520498842806065e-06, "loss": 0.0, "num_tokens": 136383427.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06905126571655273, "epoch": 0.24416, "grad_norm": 0.0, "learning_rate": 3.151772776841076e-06, "loss": 0.0, "step": 3052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 223.5703125, "completions/mean_terminated_length": 216.4666748046875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.07546891644597054, "epoch": 0.24424, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.151495571292257e-06, "loss": 0.0, "num_tokens": 136477580.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07427883520722389, "epoch": 0.24432, "grad_norm": 0.0, "learning_rate": 3.1512182676535497e-06, "loss": 0.0, "step": 3054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.5859375, "completions/mean_terminated_length": 241.242431640625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.06757279112935066, "epoch": 0.2444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.150940865944364e-06, "loss": 0.0, "num_tokens": 136574423.0, "reward": 0.38249102234840393, "reward_std": 0.0, "rewards/reward_fn/mean": 0.38249102234840393, "rewards/reward_fn/std": 0.9934079051017761, "step": 3055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06635932251811028, "epoch": 0.24448, "grad_norm": 0.0, "learning_rate": 3.150663366184114e-06, "loss": 0.0, "step": 3056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 240.3203125, "completions/mean_terminated_length": 234.64892578125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.06491170637309551, "epoch": 0.24456, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1503857683922224e-06, "loss": 0.0, "num_tokens": 136670720.0, "reward": 0.055780451744794846, "reward_std": 0.0, "rewards/reward_fn/mean": 0.055780451744794846, "rewards/reward_fn/std": 0.14816109836101532, "step": 3057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06539461016654968, "epoch": 0.24464, "grad_norm": 0.0, "learning_rate": 3.150108072588118e-06, "loss": 0.0, "step": 3058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5078125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 225.7109375, "completions/mean_terminated_length": 194.4603271484375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.06787820905447006, "epoch": 0.24472, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1498302787912366e-06, "loss": 0.0, "num_tokens": 136765147.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07051464915275574, "epoch": 0.2448, "grad_norm": 0.0, "learning_rate": 3.14955238702102e-06, "loss": 0.0, "step": 3060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 236.53125, "completions/mean_terminated_length": 220.90139770507812, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.08032146841287613, "epoch": 0.24488, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1492743972969188e-06, "loss": 0.0, "num_tokens": 136860959.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07604663446545601, "epoch": 0.24496, "grad_norm": 0.0, "learning_rate": 3.148996309638389e-06, "loss": 0.0, "step": 3062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.7890625, "completions/mean_terminated_length": 239.6931915283203, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.06850485503673553, "epoch": 0.24504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.148718124064894e-06, "loss": 0.0, "num_tokens": 136957828.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06796128675341606, "epoch": 0.24512, "grad_norm": 0.0, "learning_rate": 3.1484398405959032e-06, "loss": 0.0, "step": 3064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.359375, "completions/mean_terminated_length": 221.80247497558594, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.0746377632021904, "epoch": 0.2452, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1481614592508935e-06, "loss": 0.0, "num_tokens": 137053362.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07127923145890236, "epoch": 0.24528, "grad_norm": 0.0, "learning_rate": 3.14788298004935e-06, "loss": 0.0, "step": 3066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.7734375, "completions/mean_terminated_length": 226.6290283203125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.06559150293469429, "epoch": 0.24536, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1476044030107617e-06, "loss": 0.0, "num_tokens": 137149845.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06774462386965752, "epoch": 0.24544, "grad_norm": 0.0, "learning_rate": 3.1473257281546268e-06, "loss": 0.0, "step": 3068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.8125, "completions/mean_terminated_length": 223.71429443359375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.06286091171205044, "epoch": 0.24552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1470469555004494e-06, "loss": 0.0, "num_tokens": 137245437.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06272714957594872, "epoch": 0.2456, "grad_norm": 0.0, "learning_rate": 3.1467680850677404e-06, "loss": 0.0, "step": 3070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.296875, "completions/mean_terminated_length": 220.5399932861328, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.06993354111909866, "epoch": 0.24568, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1464891168760186e-06, "loss": 0.0, "num_tokens": 137340195.0, "reward": 0.047493621706962585, "reward_std": 0.0, "rewards/reward_fn/mean": 0.047493621706962585, "rewards/reward_fn/std": 0.12615004181861877, "step": 3071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06763816624879837, "epoch": 0.24576, "grad_norm": 0.0, "learning_rate": 3.146210050944808e-06, "loss": 0.0, "step": 3072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 211.765625, "completions/mean_terminated_length": 194.45652770996094, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.07298893854022026, "epoch": 0.24584, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1459308872936406e-06, "loss": 0.0, "num_tokens": 137432837.0, "reward": 0.8286431431770325, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8286431431770325, "rewards/reward_fn/std": 1.2749619483947754, "step": 3073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07232144102454185, "epoch": 0.24592, "grad_norm": 0.0, "learning_rate": 3.145651625942055e-06, "loss": 0.0, "step": 3074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.625, "completions/mean_terminated_length": 233.25, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.06608591973781586, "epoch": 0.246, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.145372266909597e-06, "loss": 0.0, "num_tokens": 137529685.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06888534501194954, "epoch": 0.24608, "grad_norm": 0.0, "learning_rate": 3.1450928102158178e-06, "loss": 0.0, "step": 3076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.3203125, "completions/mean_terminated_length": 240.14515686035156, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.07019209489226341, "epoch": 0.24616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1448132558802767e-06, "loss": 0.0, "num_tokens": 137627006.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07045405358076096, "epoch": 0.24624, "grad_norm": 0.0, "learning_rate": 3.1445336039225397e-06, "loss": 0.0, "step": 3078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 206.203125, "completions/mean_terminated_length": 194.7115478515625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.06982123851776123, "epoch": 0.24632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1442538543621797e-06, "loss": 0.0, "num_tokens": 137718936.0, "reward": 0.2266714870929718, "reward_std": 0.0, "rewards/reward_fn/mean": 0.2266714870929718, "rewards/reward_fn/std": 0.39648687839508057, "step": 3079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06955017894506454, "epoch": 0.2464, "grad_norm": 0.0, "learning_rate": 3.143974007218776e-06, "loss": 0.0, "step": 3080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.890625, "completions/mean_terminated_length": 235.6052703857422, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.06824186816811562, "epoch": 0.24648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.143694062511915e-06, "loss": 0.0, "num_tokens": 137815690.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07170426845550537, "epoch": 0.24656, "grad_norm": 0.0, "learning_rate": 3.1434140202611893e-06, "loss": 0.0, "step": 3082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.6640625, "completions/mean_terminated_length": 208.71621704101562, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.06856831535696983, "epoch": 0.24664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1431338804862e-06, "loss": 0.0, "num_tokens": 137910495.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07573933526873589, "epoch": 0.24672, "grad_norm": 0.0, "learning_rate": 3.142853643206553e-06, "loss": 0.0, "step": 3084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.28125, "completions/mean_terminated_length": 230.8000030517578, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.07133791968226433, "epoch": 0.2468, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1425733084418627e-06, "loss": 0.0, "num_tokens": 138006531.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06751140207052231, "epoch": 0.24688, "grad_norm": 0.0, "learning_rate": 3.1422928762117485e-06, "loss": 0.0, "step": 3086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.46875, "completions/mean_terminated_length": 227.43478393554688, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.07045537233352661, "epoch": 0.24696, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.142012346535839e-06, "loss": 0.0, "num_tokens": 138102207.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07092127203941345, "epoch": 0.24704, "grad_norm": 0.0, "learning_rate": 3.141731719433767e-06, "loss": 0.0, "step": 3088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.0, "completions/mean_terminated_length": 232.58535766601562, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.06646791473031044, "epoch": 0.24712, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.141450994925175e-06, "loss": 0.0, "num_tokens": 138198591.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06593326479196548, "epoch": 0.2472, "grad_norm": 0.0, "learning_rate": 3.141170173029709e-06, "loss": 0.0, "step": 3090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.359375, "completions/mean_terminated_length": 223.80850219726562, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.0731469988822937, "epoch": 0.24728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1408892537670245e-06, "loss": 0.0, "num_tokens": 138293869.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07165465503931046, "epoch": 0.24736, "grad_norm": 0.0, "learning_rate": 3.1406082371567834e-06, "loss": 0.0, "step": 3092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 224.4765625, "completions/mean_terminated_length": 208.5294189453125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.07355150952935219, "epoch": 0.24744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.140327123218653e-06, "loss": 0.0, "num_tokens": 138388138.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06700769811868668, "epoch": 0.24752, "grad_norm": 0.0, "learning_rate": 3.1400459119723085e-06, "loss": 0.0, "step": 3094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.09375, "completions/mean_terminated_length": 208.46807861328125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.06665850058197975, "epoch": 0.2476, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.139764603437432e-06, "loss": 0.0, "num_tokens": 138481974.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06751720234751701, "epoch": 0.24768, "grad_norm": 0.0, "learning_rate": 3.139483197633712e-06, "loss": 0.0, "step": 3096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5703125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.4609375, "completions/mean_terminated_length": 236.12725830078125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.06565060839056969, "epoch": 0.24776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.139201694580844e-06, "loss": 0.0, "num_tokens": 138579185.0, "reward": 0.25, "reward_std": 0.0, "rewards/reward_fn/mean": 0.25, "rewards/reward_fn/std": 0.6640368103981018, "step": 3097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06825168430805206, "epoch": 0.24784, "grad_norm": 0.0, "learning_rate": 3.138920094298529e-06, "loss": 0.0, "step": 3098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.0625, "completions/mean_terminated_length": 236.55555725097656, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.0700690858066082, "epoch": 0.24792, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1386383968064785e-06, "loss": 0.0, "num_tokens": 138676089.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06998327746987343, "epoch": 0.248, "grad_norm": 0.0, "learning_rate": 3.1383566021244065e-06, "loss": 0.0, "step": 3100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.1796875, "completions/mean_terminated_length": 211.67416381835938, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.0758439302444458, "epoch": 0.24808, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1380747102720365e-06, "loss": 0.0, "num_tokens": 138770448.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07350689172744751, "epoch": 0.24816, "grad_norm": 0.0, "learning_rate": 3.137792721269097e-06, "loss": 0.0, "step": 3102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.5703125, "completions/mean_terminated_length": 215.3125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.06675032153725624, "epoch": 0.24824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1375106351353255e-06, "loss": 0.0, "num_tokens": 138865497.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06689813360571861, "epoch": 0.24832, "grad_norm": 0.0, "learning_rate": 3.137228451890464e-06, "loss": 0.0, "step": 3104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.6484375, "completions/mean_terminated_length": 222.44285583496094, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.07136191427707672, "epoch": 0.2484, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1369461715542626e-06, "loss": 0.0, "num_tokens": 138961452.0, "reward": 0.07061244547367096, "reward_std": 0.0, "rewards/reward_fn/mean": 0.07061244547367096, "rewards/reward_fn/std": 0.18755705654621124, "step": 3105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06898050382733345, "epoch": 0.24848, "grad_norm": 0.0, "learning_rate": 3.136663794146479e-06, "loss": 0.0, "step": 3106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 210.109375, "completions/mean_terminated_length": 185.22891235351562, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.08529619500041008, "epoch": 0.24856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1363813196868745e-06, "loss": 0.0, "num_tokens": 139053882.0, "reward": 0.7770647406578064, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7770647406578064, "rewards/reward_fn/std": 1.2903636693954468, "step": 3107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07979326322674751, "epoch": 0.24864, "grad_norm": 0.0, "learning_rate": 3.1360987481952212e-06, "loss": 0.0, "step": 3108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.375, "completions/mean_terminated_length": 211.67999267578125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.06780978664755821, "epoch": 0.24872, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.135816079691296e-06, "loss": 0.0, "num_tokens": 139147754.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06933337450027466, "epoch": 0.2488, "grad_norm": 0.0, "learning_rate": 3.1355333141948813e-06, "loss": 0.0, "step": 3110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 234.671875, "completions/mean_terminated_length": 220.5454559326172, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.06661269068717957, "epoch": 0.24888, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.135250451725769e-06, "loss": 0.0, "num_tokens": 139243328.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07066836953163147, "epoch": 0.24896, "grad_norm": 0.0, "learning_rate": 3.134967492303756e-06, "loss": 0.0, "step": 3112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.9140625, "completions/mean_terminated_length": 229.18055725097656, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.07107345387339592, "epoch": 0.24904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1346844359486466e-06, "loss": 0.0, "num_tokens": 139339701.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07072422653436661, "epoch": 0.24912, "grad_norm": 0.0, "learning_rate": 3.134401282680252e-06, "loss": 0.0, "step": 3114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 244.6640625, "completions/mean_terminated_length": 232.59677124023438, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.07601338624954224, "epoch": 0.2492, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1341180325183896e-06, "loss": 0.0, "num_tokens": 139436554.0, "reward": 0.4021715819835663, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4021715819835663, "rewards/reward_fn/std": 0.9878264665603638, "step": 3115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0778847262263298, "epoch": 0.24928, "grad_norm": 0.0, "learning_rate": 3.133834685482884e-06, "loss": 0.0, "step": 3116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 244.1953125, "completions/mean_terminated_length": 239.39561462402344, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.0664495900273323, "epoch": 0.24936, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.133551241593566e-06, "loss": 0.0, "num_tokens": 139533347.0, "reward": 0.39967191219329834, "reward_std": 0.0, "rewards/reward_fn/mean": 0.39967191219329834, "rewards/reward_fn/std": 0.988822877407074, "step": 3117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06811915710568428, "epoch": 0.24944, "grad_norm": 0.0, "learning_rate": 3.133267700870274e-06, "loss": 0.0, "step": 3118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 231.6796875, "completions/mean_terminated_length": 222.1630401611328, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.07181805372238159, "epoch": 0.24952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.132984063332854e-06, "loss": 0.0, "num_tokens": 139628538.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06929938495159149, "epoch": 0.2496, "grad_norm": 0.0, "learning_rate": 3.132700329001156e-06, "loss": 0.0, "step": 3120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.5, "completions/mean_terminated_length": 237.5757598876953, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.06517860293388367, "epoch": 0.24968, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1324164978950393e-06, "loss": 0.0, "num_tokens": 139725626.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06412097066640854, "epoch": 0.24976, "grad_norm": 0.0, "learning_rate": 3.1321325700343686e-06, "loss": 0.0, "step": 3122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.8046875, "completions/mean_terminated_length": 235.5263214111328, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.07240090146660805, "epoch": 0.24984, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.131848545439016e-06, "loss": 0.0, "num_tokens": 139821985.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06811481341719627, "epoch": 0.24992, "grad_norm": 0.0, "learning_rate": 3.1315644241288607e-06, "loss": 0.0, "step": 3124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5078125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.703125, "completions/mean_terminated_length": 224.920654296875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.07287808507680893, "epoch": 0.25, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1312802061237875e-06, "loss": 0.0, "num_tokens": 139918331.0, "reward": 0.12031254917383194, "reward_std": 0.0, "rewards/reward_fn/mean": 0.12031254917383194, "rewards/reward_fn/std": 0.3120490610599518, "step": 3125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07060377299785614, "epoch": 0.25008, "grad_norm": 0.0, "learning_rate": 3.1309958914436888e-06, "loss": 0.0, "step": 3126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6171875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 250.96875, "completions/mean_terminated_length": 242.8571319580078, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.0690285973250866, "epoch": 0.25016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.130711480108464e-06, "loss": 0.0, "num_tokens": 140015991.0, "reward": 0.051705554127693176, "reward_std": 0.0, "rewards/reward_fn/mean": 0.051705554127693176, "rewards/reward_fn/std": 0.1373375654220581, "step": 3127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07074926048517227, "epoch": 0.25024, "grad_norm": 0.0, "learning_rate": 3.1304269721380182e-06, "loss": 0.0, "step": 3128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.59375, "completions/mean_terminated_length": 231.08108520507812, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.0696297213435173, "epoch": 0.25032, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.130142367552265e-06, "loss": 0.0, "num_tokens": 140112451.0, "reward": 0.12208539247512817, "reward_std": 0.0, "rewards/reward_fn/mean": 0.12208539247512817, "rewards/reward_fn/std": 0.32427677512168884, "step": 3129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07299041748046875, "epoch": 0.2504, "grad_norm": 0.0, "learning_rate": 3.129857666371123e-06, "loss": 0.0, "step": 3130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 248.0390625, "completions/mean_terminated_length": 241.44285583496094, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.06847302988171577, "epoch": 0.25048, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.129572868614519e-06, "loss": 0.0, "num_tokens": 140209736.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06731395795941353, "epoch": 0.25056, "grad_norm": 0.0, "learning_rate": 3.129287974302384e-06, "loss": 0.0, "step": 3132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 224.7890625, "completions/mean_terminated_length": 213.94737243652344, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.06232259422540665, "epoch": 0.25064, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1290029834546597e-06, "loss": 0.0, "num_tokens": 140304045.0, "reward": 0.04961630329489708, "reward_std": 0.0, "rewards/reward_fn/mean": 0.04961630329489708, "rewards/reward_fn/std": 0.1317882090806961, "step": 3133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0630816575139761, "epoch": 0.25072, "grad_norm": 0.0, "learning_rate": 3.128717896091291e-06, "loss": 0.0, "step": 3134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.8359375, "completions/mean_terminated_length": 237.6794891357422, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.06032097712159157, "epoch": 0.2508, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.128432712232232e-06, "loss": 0.0, "num_tokens": 140400920.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05608532950282097, "epoch": 0.25088, "grad_norm": 0.0, "learning_rate": 3.128147431897442e-06, "loss": 0.0, "step": 3136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 235.8359375, "completions/mean_terminated_length": 224.13580322265625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.06413021683692932, "epoch": 0.25096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1278620551068878e-06, "loss": 0.0, "num_tokens": 140496643.0, "reward": 0.49308690428733826, "reward_std": 0.0, "rewards/reward_fn/mean": 0.49308690428733826, "rewards/reward_fn/std": 1.0006216764450073, "step": 3137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06336656957864761, "epoch": 0.25104, "grad_norm": 0.0, "learning_rate": 3.1275765818805424e-06, "loss": 0.0, "step": 3138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.640625, "completions/mean_terminated_length": 237.05154418945312, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.0738794319331646, "epoch": 0.25112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.127291012238387e-06, "loss": 0.0, "num_tokens": 140593109.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07248782366514206, "epoch": 0.2512, "grad_norm": 0.0, "learning_rate": 3.127005346200407e-06, "loss": 0.0, "step": 3140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.0859375, "completions/mean_terminated_length": 201.3000030517578, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.07109517604112625, "epoch": 0.25128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1267195837865968e-06, "loss": 0.0, "num_tokens": 140687584.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07220370322465897, "epoch": 0.25136, "grad_norm": 0.0, "learning_rate": 3.126433725016957e-06, "loss": 0.0, "step": 3142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.265625, "completions/mean_terminated_length": 238.11904907226562, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.06838779151439667, "epoch": 0.25144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1261477699114943e-06, "loss": 0.0, "num_tokens": 140784386.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07036495208740234, "epoch": 0.25152, "grad_norm": 0.0, "learning_rate": 3.125861718490222e-06, "loss": 0.0, "step": 3144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.53125, "completions/mean_terminated_length": 213.16505432128906, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.0659663937985897, "epoch": 0.2516, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.125575570773162e-06, "loss": 0.0, "num_tokens": 140878278.0, "reward": 0.4691332280635834, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4691332280635834, "rewards/reward_fn/std": 0.9917086362838745, "step": 3145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06805944442749023, "epoch": 0.25168, "grad_norm": 0.0, "learning_rate": 3.12528932678034e-06, "loss": 0.0, "step": 3146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 214.390625, "completions/mean_terminated_length": 205.2761993408203, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.06843940913677216, "epoch": 0.25176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1250029865317916e-06, "loss": 0.0, "num_tokens": 140971256.0, "reward": 0.7886883616447449, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7886883616447449, "rewards/reward_fn/std": 1.2856351137161255, "step": 3147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06738188117742538, "epoch": 0.25184, "grad_norm": 0.0, "learning_rate": 3.1247165500475567e-06, "loss": 0.0, "step": 3148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 201.8515625, "completions/mean_terminated_length": 184.54638671875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.06542938202619553, "epoch": 0.25192, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1244300173476833e-06, "loss": 0.0, "num_tokens": 141062629.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06369229033589363, "epoch": 0.252, "grad_norm": 0.0, "learning_rate": 3.124143388452225e-06, "loss": 0.0, "step": 3150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.953125, "completions/mean_terminated_length": 237.4216766357422, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.06623466312885284, "epoch": 0.25208, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1238566633812435e-06, "loss": 0.0, "num_tokens": 141159391.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0650639757514, "epoch": 0.25216, "grad_norm": 0.0, "learning_rate": 3.1235698421548064e-06, "loss": 0.0, "step": 3152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.515625, "completions/mean_terminated_length": 213.1058807373047, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.0689433179795742, "epoch": 0.25224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1232829247929873e-06, "loss": 0.0, "num_tokens": 141254049.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07135536521673203, "epoch": 0.25232, "grad_norm": 0.0, "learning_rate": 3.122995911315869e-06, "loss": 0.0, "step": 3154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 232.6171875, "completions/mean_terminated_length": 223.46739196777344, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.07569457590579987, "epoch": 0.2524, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1227088017435376e-06, "loss": 0.0, "num_tokens": 141349360.0, "reward": 0.11692613363265991, "reward_std": 0.0, "rewards/reward_fn/mean": 0.11692613363265991, "rewards/reward_fn/std": 0.310573011636734, "step": 3155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07458057254552841, "epoch": 0.25248, "grad_norm": 0.0, "learning_rate": 3.122421596096089e-06, "loss": 0.0, "step": 3156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.4296875, "completions/mean_terminated_length": 239.62686157226562, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.07335465773940086, "epoch": 0.25256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.122134294393624e-06, "loss": 0.0, "num_tokens": 141446567.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07102075219154358, "epoch": 0.25264, "grad_norm": 0.0, "learning_rate": 3.121846896656251e-06, "loss": 0.0, "step": 3158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.8984375, "completions/mean_terminated_length": 226.2210693359375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.07065967097878456, "epoch": 0.25272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1215594029040846e-06, "loss": 0.0, "num_tokens": 141542042.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07057971507310867, "epoch": 0.2528, "grad_norm": 0.0, "learning_rate": 3.121271813157247e-06, "loss": 0.0, "step": 3160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 211.921875, "completions/mean_terminated_length": 198.42857360839844, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.06515387818217278, "epoch": 0.25288, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.120984127435865e-06, "loss": 0.0, "num_tokens": 141634704.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0634281113743782, "epoch": 0.25296, "grad_norm": 0.0, "learning_rate": 3.1206963457600747e-06, "loss": 0.0, "step": 3162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 243.0390625, "completions/mean_terminated_length": 238.8968963623047, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.07188450917601585, "epoch": 0.25304, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.120408468150018e-06, "loss": 0.0, "num_tokens": 141731349.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07032283395528793, "epoch": 0.25312, "grad_norm": 0.0, "learning_rate": 3.120120494625842e-06, "loss": 0.0, "step": 3164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 215.6171875, "completions/mean_terminated_length": 203.78787231445312, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.06893589720129967, "epoch": 0.2532, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.119832425207703e-06, "loss": 0.0, "num_tokens": 141824484.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06793243065476418, "epoch": 0.25328, "grad_norm": 0.0, "learning_rate": 3.119544259915762e-06, "loss": 0.0, "step": 3166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5078125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.9140625, "completions/mean_terminated_length": 209.09524536132812, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.06554703041911125, "epoch": 0.25336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1192559987701886e-06, "loss": 0.0, "num_tokens": 141919833.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06454185396432877, "epoch": 0.25344, "grad_norm": 0.0, "learning_rate": 3.118967641791157e-06, "loss": 0.0, "step": 3168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.6875, "completions/mean_terminated_length": 215.375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.06575483828783035, "epoch": 0.25352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.11867918899885e-06, "loss": 0.0, "num_tokens": 142015537.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07116707414388657, "epoch": 0.2536, "grad_norm": 0.0, "learning_rate": 3.118390640413455e-06, "loss": 0.0, "step": 3170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 236.4375, "completions/mean_terminated_length": 226.5411834716797, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.06637559086084366, "epoch": 0.25368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1181019960551684e-06, "loss": 0.0, "num_tokens": 142111337.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06578591093420982, "epoch": 0.25376, "grad_norm": 0.0, "learning_rate": 3.117813255944192e-06, "loss": 0.0, "step": 3172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 220.2578125, "completions/mean_terminated_length": 216.21737670898438, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.06874924898147583, "epoch": 0.25384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1175244201007345e-06, "loss": 0.0, "num_tokens": 142205066.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06731562316417694, "epoch": 0.25392, "grad_norm": 0.0, "learning_rate": 3.1172354885450115e-06, "loss": 0.0, "step": 3174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1640625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.6640625, "completions/mean_terminated_length": 208.9439239501953, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.06445249542593956, "epoch": 0.254, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1169464612972447e-06, "loss": 0.0, "num_tokens": 142298335.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06567161902785301, "epoch": 0.25408, "grad_norm": 0.0, "learning_rate": 3.1166573383776634e-06, "loss": 0.0, "step": 3176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.5234375, "completions/mean_terminated_length": 232.96511840820312, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.07133708521723747, "epoch": 0.25416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.116368119806503e-06, "loss": 0.0, "num_tokens": 142394658.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06939449161291122, "epoch": 0.25424, "grad_norm": 0.0, "learning_rate": 3.116078805604006e-06, "loss": 0.0, "step": 3178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 211.453125, "completions/mean_terminated_length": 200.64077758789062, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.06753382831811905, "epoch": 0.25432, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.115789395790421e-06, "loss": 0.0, "num_tokens": 142487260.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0689769834280014, "epoch": 0.2544, "grad_norm": 0.0, "learning_rate": 3.115499890386004e-06, "loss": 0.0, "step": 3180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 221.703125, "completions/mean_terminated_length": 209.78948974609375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.07661576196551323, "epoch": 0.25448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1152102894110166e-06, "loss": 0.0, "num_tokens": 142581174.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07478129118680954, "epoch": 0.25456, "grad_norm": 0.0, "learning_rate": 3.1149205928857286e-06, "loss": 0.0, "step": 3182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.640625, "completions/mean_terminated_length": 215.33334350585938, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.07179739326238632, "epoch": 0.25464, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1146308008304156e-06, "loss": 0.0, "num_tokens": 142675208.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07220049202442169, "epoch": 0.25472, "grad_norm": 0.0, "learning_rate": 3.1143409132653594e-06, "loss": 0.0, "step": 3184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.8125, "completions/mean_terminated_length": 214.61727905273438, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.07438052073121071, "epoch": 0.2548, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1140509302108494e-06, "loss": 0.0, "num_tokens": 142770160.0, "reward": 0.384978711605072, "reward_std": 0.0, "rewards/reward_fn/mean": 0.384978711605072, "rewards/reward_fn/std": 0.9926154613494873, "step": 3185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0726441740989685, "epoch": 0.25488, "grad_norm": 0.0, "learning_rate": 3.1137608516871814e-06, "loss": 0.0, "step": 3186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.921875, "completions/mean_terminated_length": 220.33334350585938, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.06901867315173149, "epoch": 0.25496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1134706777146584e-06, "loss": 0.0, "num_tokens": 142865254.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0720277652144432, "epoch": 0.25504, "grad_norm": 0.0, "learning_rate": 3.1131804083135883e-06, "loss": 0.0, "step": 3188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.6875, "completions/mean_terminated_length": 212.8205108642578, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.06584404408931732, "epoch": 0.25512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1128900435042877e-06, "loss": 0.0, "num_tokens": 142960190.0, "reward": 0.4224936366081238, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4224936366081238, "rewards/reward_fn/std": 0.9859711527824402, "step": 3189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06621222198009491, "epoch": 0.2552, "grad_norm": 0.0, "learning_rate": 3.112599583307079e-06, "loss": 0.0, "step": 3190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 221.984375, "completions/mean_terminated_length": 201.5749969482422, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.07200495153665543, "epoch": 0.25528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.112309027742291e-06, "loss": 0.0, "num_tokens": 143054140.0, "reward": 0.3799973428249359, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3799973428249359, "rewards/reward_fn/std": 0.9942457675933838, "step": 3191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06805980205535889, "epoch": 0.25536, "grad_norm": 0.0, "learning_rate": 3.1120183768302598e-06, "loss": 0.0, "step": 3192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.328125, "completions/mean_terminated_length": 195.54762268066406, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.06640078127384186, "epoch": 0.25544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1117276305913284e-06, "loss": 0.0, "num_tokens": 143147366.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06971593573689461, "epoch": 0.25552, "grad_norm": 0.0, "learning_rate": 3.1114367890458443e-06, "loss": 0.0, "step": 3194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 232.7109375, "completions/mean_terminated_length": 220.08433532714844, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.07191572338342667, "epoch": 0.2556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1111458522141656e-06, "loss": 0.0, "num_tokens": 143242689.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0705629363656044, "epoch": 0.25568, "grad_norm": 0.0, "learning_rate": 3.110854820116653e-06, "loss": 0.0, "step": 3196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 196.1796875, "completions/mean_terminated_length": 167.98851013183594, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.06768359988927841, "epoch": 0.25576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.110563692773677e-06, "loss": 0.0, "num_tokens": 143333336.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06837523356080055, "epoch": 0.25584, "grad_norm": 0.0, "learning_rate": 3.110272470205612e-06, "loss": 0.0, "step": 3198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5703125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 248.4140625, "completions/mean_terminated_length": 238.34544372558594, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.06557631120085716, "epoch": 0.25592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1099811524328416e-06, "loss": 0.0, "num_tokens": 143430669.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06638897210359573, "epoch": 0.256, "grad_norm": 0.0, "learning_rate": 3.1096897394757545e-06, "loss": 0.0, "step": 3200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.8984375, "completions/mean_terminated_length": 231.2739715576172, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.06546115130186081, "epoch": 0.25608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1093982313547466e-06, "loss": 0.0, "num_tokens": 143527168.0, "reward": 0.9020648002624512, "reward_std": 0.0, "rewards/reward_fn/mean": 0.9020648002624512, "rewards/reward_fn/std": 1.2567732334136963, "step": 3201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06448971480131149, "epoch": 0.25616, "grad_norm": 0.0, "learning_rate": 3.1091066280902208e-06, "loss": 0.0, "step": 3202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.171875, "completions/mean_terminated_length": 243.44444274902344, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.06764229759573936, "epoch": 0.25624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.108814929702586e-06, "loss": 0.0, "num_tokens": 143624342.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06732309609651566, "epoch": 0.25632, "grad_norm": 0.0, "learning_rate": 3.1085231362122576e-06, "loss": 0.0, "step": 3204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5078125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 244.59375, "completions/mean_terminated_length": 232.82540893554688, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.0740463025867939, "epoch": 0.2564, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1082312476396582e-06, "loss": 0.0, "num_tokens": 143721186.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07273328304290771, "epoch": 0.25648, "grad_norm": 0.0, "learning_rate": 3.1079392640052174e-06, "loss": 0.0, "step": 3206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 243.7265625, "completions/mean_terminated_length": 238.73626708984375, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.06574786454439163, "epoch": 0.25656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.107647185329371e-06, "loss": 0.0, "num_tokens": 143817919.0, "reward": 0.004997334908694029, "reward_std": 0.0, "rewards/reward_fn/mean": 0.004997334908694029, "rewards/reward_fn/std": 0.013273656368255615, "step": 3207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06279782764613628, "epoch": 0.25664, "grad_norm": 0.0, "learning_rate": 3.107355011632561e-06, "loss": 0.0, "step": 3208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 202.7109375, "completions/mean_terminated_length": 193.42201232910156, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.06251689046621323, "epoch": 0.25672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1070627429352358e-06, "loss": 0.0, "num_tokens": 143909402.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06000448949635029, "epoch": 0.2568, "grad_norm": 0.0, "learning_rate": 3.1067703792578528e-06, "loss": 0.0, "step": 3210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 242.2421875, "completions/mean_terminated_length": 231.19717407226562, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.067982979118824, "epoch": 0.25688, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.106477920620873e-06, "loss": 0.0, "num_tokens": 144005945.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0661151371896267, "epoch": 0.25696, "grad_norm": 0.0, "learning_rate": 3.1061853670447657e-06, "loss": 0.0, "step": 3212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.03125, "completions/mean_terminated_length": 231.5955047607422, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.07351033017039299, "epoch": 0.25704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1058927185500067e-06, "loss": 0.0, "num_tokens": 144102077.0, "reward": 0.49431151151657104, "reward_std": 0.0, "rewards/reward_fn/mean": 0.49431151151657104, "rewards/reward_fn/std": 1.0011838674545288, "step": 3213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06932905316352844, "epoch": 0.25712, "grad_norm": 0.0, "learning_rate": 3.1055999751570784e-06, "loss": 0.0, "step": 3214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 200.9453125, "completions/mean_terminated_length": 194.7217254638672, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.07487403228878975, "epoch": 0.2572, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1053071368864706e-06, "loss": 0.0, "num_tokens": 144193334.0, "reward": 1.1299972534179688, "reward_std": 0.0, "rewards/reward_fn/mean": 1.1299972534179688, "rewards/reward_fn/std": 1.4542447328567505, "step": 3215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07250453531742096, "epoch": 0.25728, "grad_norm": 0.0, "learning_rate": 3.1050142037586767e-06, "loss": 0.0, "step": 3216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.1796875, "completions/mean_terminated_length": 228.0699920654297, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.0749058686196804, "epoch": 0.25736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.104721175794201e-06, "loss": 0.0, "num_tokens": 144288845.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07461762055754662, "epoch": 0.25744, "grad_norm": 0.0, "learning_rate": 3.104428053013551e-06, "loss": 0.0, "step": 3218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.4453125, "completions/mean_terminated_length": 226.43820190429688, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.06285609677433968, "epoch": 0.25752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.104134835437243e-06, "loss": 0.0, "num_tokens": 144384518.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06115576811134815, "epoch": 0.2576, "grad_norm": 0.0, "learning_rate": 3.1038415230857996e-06, "loss": 0.0, "step": 3220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 227.3515625, "completions/mean_terminated_length": 211.81927490234375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.06939048692584038, "epoch": 0.25768, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.103548115979748e-06, "loss": 0.0, "num_tokens": 144479155.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07057221606373787, "epoch": 0.25776, "grad_norm": 0.0, "learning_rate": 3.1032546141396244e-06, "loss": 0.0, "step": 3222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 212.8828125, "completions/mean_terminated_length": 201.35643005371094, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.06808945536613464, "epoch": 0.25784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1029610175859707e-06, "loss": 0.0, "num_tokens": 144571940.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06745270639657974, "epoch": 0.25792, "grad_norm": 0.0, "learning_rate": 3.1026673263393365e-06, "loss": 0.0, "step": 3224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.4375, "completions/mean_terminated_length": 236.11764526367188, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.08104865252971649, "epoch": 0.258, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.102373540420276e-06, "loss": 0.0, "num_tokens": 144668892.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08046553656458855, "epoch": 0.25808, "grad_norm": 0.0, "learning_rate": 3.102079659849351e-06, "loss": 0.0, "step": 3226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 214.9609375, "completions/mean_terminated_length": 207.36111450195312, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.06227400712668896, "epoch": 0.25816, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1017856846471313e-06, "loss": 0.0, "num_tokens": 144761943.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06552331894636154, "epoch": 0.25824, "grad_norm": 0.0, "learning_rate": 3.101491614834191e-06, "loss": 0.0, "step": 3228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 209.234375, "completions/mean_terminated_length": 185.57647705078125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.05543212406337261, "epoch": 0.25832, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1011974504311117e-06, "loss": 0.0, "num_tokens": 144854261.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0594203844666481, "epoch": 0.2584, "grad_norm": 0.0, "learning_rate": 3.1009031914584825e-06, "loss": 0.0, "step": 3230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.953125, "completions/mean_terminated_length": 229.6666717529297, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.062158893793821335, "epoch": 0.25848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.100608837936898e-06, "loss": 0.0, "num_tokens": 144950511.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06464975327253342, "epoch": 0.25856, "grad_norm": 0.0, "learning_rate": 3.1003143898869597e-06, "loss": 0.0, "step": 3232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.7421875, "completions/mean_terminated_length": 232.08045959472656, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.062148671597242355, "epoch": 0.25864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.1000198473292765e-06, "loss": 0.0, "num_tokens": 145046734.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06272437795996666, "epoch": 0.25872, "grad_norm": 0.0, "learning_rate": 3.099725210284463e-06, "loss": 0.0, "step": 3234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.9609375, "completions/mean_terminated_length": 234.61111450195312, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.060754718258976936, "epoch": 0.2588, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0994304787731405e-06, "loss": 0.0, "num_tokens": 145143113.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0613922905176878, "epoch": 0.25888, "grad_norm": 0.0, "learning_rate": 3.099135652815937e-06, "loss": 0.0, "step": 3236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1328125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 216.078125, "completions/mean_terminated_length": 209.96397399902344, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.07200564444065094, "epoch": 0.25896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0988407324334876e-06, "loss": 0.0, "num_tokens": 145236307.0, "reward": 0.41141408681869507, "reward_std": 0.0, "rewards/reward_fn/mean": 0.41141408681869507, "rewards/reward_fn/std": 0.9868917465209961, "step": 3237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07378847524523735, "epoch": 0.25904, "grad_norm": 0.0, "learning_rate": 3.098545717646433e-06, "loss": 0.0, "step": 3238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 205.84375, "completions/mean_terminated_length": 187.70211791992188, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.07187173888087273, "epoch": 0.25912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.098250608475422e-06, "loss": 0.0, "num_tokens": 145328191.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06664274632930756, "epoch": 0.2592, "grad_norm": 0.0, "learning_rate": 3.0979554049411076e-06, "loss": 0.0, "step": 3240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.78125, "completions/mean_terminated_length": 243.17071533203125, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 0.06658424809575081, "epoch": 0.25928, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.097660107064153e-06, "loss": 0.0, "num_tokens": 145425443.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06805162876844406, "epoch": 0.25936, "grad_norm": 0.0, "learning_rate": 3.0973647148652246e-06, "loss": 0.0, "step": 3242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.9921875, "completions/mean_terminated_length": 238.09677124023438, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.0687658078968525, "epoch": 0.25944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.097069228364997e-06, "loss": 0.0, "num_tokens": 145522082.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06469947472214699, "epoch": 0.25952, "grad_norm": 0.0, "learning_rate": 3.0967736475841516e-06, "loss": 0.0, "step": 3244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 217.625, "completions/mean_terminated_length": 199.54022216796875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06957365572452545, "epoch": 0.2596, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0964779725433747e-06, "loss": 0.0, "num_tokens": 145615474.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07609241083264351, "epoch": 0.25968, "grad_norm": 0.0, "learning_rate": 3.096182203263361e-06, "loss": 0.0, "step": 3246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 223.4921875, "completions/mean_terminated_length": 201.96104431152344, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.0680534578859806, "epoch": 0.25976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0958863397648127e-06, "loss": 0.0, "num_tokens": 145709617.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06954298540949821, "epoch": 0.25984, "grad_norm": 0.0, "learning_rate": 3.095590382068435e-06, "loss": 0.0, "step": 3248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5234375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 247.0234375, "completions/mean_terminated_length": 237.16392517089844, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.06638198345899582, "epoch": 0.25992, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0952943301949427e-06, "loss": 0.0, "num_tokens": 145806772.0, "reward": 0.11442755907773972, "reward_std": 0.0, "rewards/reward_fn/mean": 0.11442755907773972, "rewards/reward_fn/std": 0.3039364516735077, "step": 3249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06732837110757828, "epoch": 0.26, "grad_norm": 0.0, "learning_rate": 3.0949981841650564e-06, "loss": 0.0, "step": 3250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 212.8125, "completions/mean_terminated_length": 205.74545288085938, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.07197660207748413, "epoch": 0.26008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0947019439995033e-06, "loss": 0.0, "num_tokens": 145899548.0, "reward": 0.47473567724227905, "reward_std": 0.0, "rewards/reward_fn/mean": 0.47473567724227905, "rewards/reward_fn/std": 0.9914607405662537, "step": 3251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07150556892156601, "epoch": 0.26016, "grad_norm": 0.0, "learning_rate": 3.0944056097190165e-06, "loss": 0.0, "step": 3252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.5625, "completions/mean_terminated_length": 213.1707305908203, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.07435078173875809, "epoch": 0.26024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0941091813443366e-06, "loss": 0.0, "num_tokens": 145994340.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0752703845500946, "epoch": 0.26032, "grad_norm": 0.0, "learning_rate": 3.0938126588962107e-06, "loss": 0.0, "step": 3254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 227.0546875, "completions/mean_terminated_length": 203.81689453125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.06769749894738197, "epoch": 0.2604, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.093516042395392e-06, "loss": 0.0, "num_tokens": 146088939.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06492029130458832, "epoch": 0.26048, "grad_norm": 0.0, "learning_rate": 3.093219331862641e-06, "loss": 0.0, "step": 3256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.1796875, "completions/mean_terminated_length": 213.558837890625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.07323884218931198, "epoch": 0.26056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0929225273187234e-06, "loss": 0.0, "num_tokens": 146182914.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0785786435008049, "epoch": 0.26064, "grad_norm": 0.0, "learning_rate": 3.0926256287844127e-06, "loss": 0.0, "step": 3258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.125, "completions/mean_terminated_length": 229.70114135742188, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.06346847116947174, "epoch": 0.26072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0923286362804887e-06, "loss": 0.0, "num_tokens": 146278930.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06341231241822243, "epoch": 0.2608, "grad_norm": 0.0, "learning_rate": 3.092031549827738e-06, "loss": 0.0, "step": 3260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.3203125, "completions/mean_terminated_length": 223.4329833984375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.07124194502830505, "epoch": 0.26088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0917343694469535e-06, "loss": 0.0, "num_tokens": 146374075.0, "reward": 0.05376052483916283, "reward_std": 0.0, "rewards/reward_fn/mean": 0.05376052483916283, "rewards/reward_fn/std": 0.1427958756685257, "step": 3261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07306434959173203, "epoch": 0.26096, "grad_norm": 0.0, "learning_rate": 3.0914370951589345e-06, "loss": 0.0, "step": 3262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 216.3984375, "completions/mean_terminated_length": 178.015380859375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.06589827686548233, "epoch": 0.26104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0911397269844867e-06, "loss": 0.0, "num_tokens": 146467310.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 3263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0719708539545536, "epoch": 0.26112, "grad_norm": 0.0, "learning_rate": 3.0908422649444236e-06, "loss": 0.0, "step": 3264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 228.359375, "completions/mean_terminated_length": 212.32098388671875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.06995141878724098, "epoch": 0.2612, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0905447090595635e-06, "loss": 0.0, "num_tokens": 146562076.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06787288188934326, "epoch": 0.26128, "grad_norm": 0.0, "learning_rate": 3.090247059350733e-06, "loss": 0.0, "step": 3266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 223.1875, "completions/mean_terminated_length": 207.72413635253906, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.07789439707994461, "epoch": 0.26136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0899493158387643e-06, "loss": 0.0, "num_tokens": 146656180.0, "reward": 0.051705554127693176, "reward_std": 0.0, "rewards/reward_fn/mean": 0.051705554127693176, "rewards/reward_fn/std": 0.1373375654220581, "step": 3267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0755755864083767, "epoch": 0.26144, "grad_norm": 0.0, "learning_rate": 3.089651478544495e-06, "loss": 0.0, "step": 3268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 203.328125, "completions/mean_terminated_length": 187.20407104492188, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.06065406650304794, "epoch": 0.26152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0893535474887725e-06, "loss": 0.0, "num_tokens": 146747742.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0629091951996088, "epoch": 0.2616, "grad_norm": 0.0, "learning_rate": 3.0890555226924473e-06, "loss": 0.0, "step": 3270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 232.125, "completions/mean_terminated_length": 222.78260803222656, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.07647693529725075, "epoch": 0.26168, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0887574041763794e-06, "loss": 0.0, "num_tokens": 146842990.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07702819630503654, "epoch": 0.26176, "grad_norm": 0.0, "learning_rate": 3.0884591919614324e-06, "loss": 0.0, "step": 3272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 210.828125, "completions/mean_terminated_length": 200.4038543701172, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.06274329125881195, "epoch": 0.26184, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.088160886068479e-06, "loss": 0.0, "num_tokens": 146935512.0, "reward": 0.76492840051651, "reward_std": 0.0, "rewards/reward_fn/mean": 0.76492840051651, "rewards/reward_fn/std": 1.296067476272583, "step": 3273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06341362744569778, "epoch": 0.26192, "grad_norm": 0.0, "learning_rate": 3.087862486518397e-06, "loss": 0.0, "step": 3274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 215.796875, "completions/mean_terminated_length": 204.02020263671875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.07196547836065292, "epoch": 0.262, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.087563993332072e-06, "loss": 0.0, "num_tokens": 147028670.0, "reward": 0.125, "reward_std": 0.0, "rewards/reward_fn/mean": 0.125, "rewards/reward_fn/std": 0.3320184051990509, "step": 3275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06921093165874481, "epoch": 0.26208, "grad_norm": 0.0, "learning_rate": 3.0872654065303944e-06, "loss": 0.0, "step": 3276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.6875, "completions/mean_terminated_length": 219.38027954101562, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.0702383704483509, "epoch": 0.26216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.086966726134263e-06, "loss": 0.0, "num_tokens": 147124374.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07042442262172699, "epoch": 0.26224, "grad_norm": 0.0, "learning_rate": 3.0866679521645813e-06, "loss": 0.0, "step": 3278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 221.34375, "completions/mean_terminated_length": 205.59091186523438, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.06743687391281128, "epoch": 0.26232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.086369084642261e-06, "loss": 0.0, "num_tokens": 147218242.0, "reward": 0.11011891067028046, "reward_std": 0.0, "rewards/reward_fn/mean": 0.11011891067028046, "rewards/reward_fn/std": 0.2924920320510864, "step": 3279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07102243229746819, "epoch": 0.2624, "grad_norm": 0.0, "learning_rate": 3.086070123588219e-06, "loss": 0.0, "step": 3280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.453125, "completions/mean_terminated_length": 236.6505889892578, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.0786827988922596, "epoch": 0.26248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.085771069023381e-06, "loss": 0.0, "num_tokens": 147314940.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07826124876737595, "epoch": 0.26256, "grad_norm": 0.0, "learning_rate": 3.0854719209686755e-06, "loss": 0.0, "step": 3282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.4296875, "completions/mean_terminated_length": 238.6538543701172, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.07139116525650024, "epoch": 0.26264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0851726794450415e-06, "loss": 0.0, "num_tokens": 147411891.0, "reward": 0.02467191591858864, "reward_std": 0.0, "rewards/reward_fn/mean": 0.02467191591858864, "rewards/reward_fn/std": 0.06553223729133606, "step": 3283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06999862939119339, "epoch": 0.26272, "grad_norm": 0.0, "learning_rate": 3.084873344473422e-06, "loss": 0.0, "step": 3284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 239.8671875, "completions/mean_terminated_length": 237.05503845214844, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.07172422483563423, "epoch": 0.2628, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0845739160747667e-06, "loss": 0.0, "num_tokens": 147508130.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06867489218711853, "epoch": 0.26288, "grad_norm": 0.0, "learning_rate": 3.0842743942700336e-06, "loss": 0.0, "step": 3286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 229.3828125, "completions/mean_terminated_length": 221.5858612060547, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.06729097664356232, "epoch": 0.26296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0839747790801854e-06, "loss": 0.0, "num_tokens": 147603027.0, "reward": 0.11482523381710052, "reward_std": 0.0, "rewards/reward_fn/mean": 0.11482523381710052, "rewards/reward_fn/std": 0.3049927055835724, "step": 3287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06628137454390526, "epoch": 0.26304, "grad_norm": 0.0, "learning_rate": 3.0836750705261923e-06, "loss": 0.0, "step": 3288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.1171875, "completions/mean_terminated_length": 228.52703857421875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.06888815015554428, "epoch": 0.26312, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.08337526862903e-06, "loss": 0.0, "num_tokens": 147699298.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0685909353196621, "epoch": 0.2632, "grad_norm": 0.0, "learning_rate": 3.0830753734096824e-06, "loss": 0.0, "step": 3290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 217.4140625, "completions/mean_terminated_length": 208.50962829589844, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.06965020671486855, "epoch": 0.26328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0827753848891385e-06, "loss": 0.0, "num_tokens": 147792663.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07172418385744095, "epoch": 0.26336, "grad_norm": 0.0, "learning_rate": 3.0824753030883943e-06, "loss": 0.0, "step": 3292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 223.3515625, "completions/mean_terminated_length": 202.42308044433594, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.0791306346654892, "epoch": 0.26344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0821751280284526e-06, "loss": 0.0, "num_tokens": 147886788.0, "reward": 0.3972600996494293, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3972600996494293, "rewards/reward_fn/std": 0.9893408417701721, "step": 3293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08166230469942093, "epoch": 0.26352, "grad_norm": 0.0, "learning_rate": 3.081874859730323e-06, "loss": 0.0, "step": 3294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.0, "completions/mean_terminated_length": 240.43243408203125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.0710124671459198, "epoch": 0.2636, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.081574498215019e-06, "loss": 0.0, "num_tokens": 147983940.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0720360167324543, "epoch": 0.26368, "grad_norm": 0.0, "learning_rate": 3.0812740435035658e-06, "loss": 0.0, "step": 3296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 214.65625, "completions/mean_terminated_length": 189.01266479492188, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.07016097009181976, "epoch": 0.26376, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0809734956169893e-06, "loss": 0.0, "num_tokens": 148076952.0, "reward": 0.8749481439590454, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8749481439590454, "rewards/reward_fn/std": 1.273591160774231, "step": 3297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06989838927984238, "epoch": 0.26384, "grad_norm": 0.0, "learning_rate": 3.0806728545763263e-06, "loss": 0.0, "step": 3298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.5390625, "completions/mean_terminated_length": 230.06930541992188, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.06153763271868229, "epoch": 0.26392, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.080372120402618e-06, "loss": 0.0, "num_tokens": 148172637.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.061768531799316406, "epoch": 0.264, "grad_norm": 0.0, "learning_rate": 3.0800712931169125e-06, "loss": 0.0, "step": 3300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 203.609375, "completions/mean_terminated_length": 195.03636169433594, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.07050756737589836, "epoch": 0.26408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.079770372740265e-06, "loss": 0.0, "num_tokens": 148264235.0, "reward": 0.3923865556716919, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3923865556716919, "rewards/reward_fn/std": 0.9905130863189697, "step": 3301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07102623954415321, "epoch": 0.26416, "grad_norm": 0.0, "learning_rate": 3.0794693592937365e-06, "loss": 0.0, "step": 3302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 220.828125, "completions/mean_terminated_length": 215.4414520263672, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.07147959247231483, "epoch": 0.26424, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.079168252798394e-06, "loss": 0.0, "num_tokens": 148358037.0, "reward": 0.03411313518881798, "reward_std": 0.0, "rewards/reward_fn/mean": 0.03411313518881798, "rewards/reward_fn/std": 0.09060950577259064, "step": 3303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07244141772389412, "epoch": 0.26432, "grad_norm": 0.0, "learning_rate": 3.078867053275313e-06, "loss": 0.0, "step": 3304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.7734375, "completions/mean_terminated_length": 225.73403930664062, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.06852549687027931, "epoch": 0.2644, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.078565760745574e-06, "loss": 0.0, "num_tokens": 148453496.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06637933850288391, "epoch": 0.26448, "grad_norm": 0.0, "learning_rate": 3.0782643752302632e-06, "loss": 0.0, "step": 3306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 219.46875, "completions/mean_terminated_length": 201.62789916992188, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.06945040076971054, "epoch": 0.26456, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0779628967504764e-06, "loss": 0.0, "num_tokens": 148547124.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07199371233582497, "epoch": 0.26464, "grad_norm": 0.0, "learning_rate": 3.077661325327312e-06, "loss": 0.0, "step": 3308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.890625, "completions/mean_terminated_length": 218.62789916992188, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.06751428544521332, "epoch": 0.26472, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.077359660981878e-06, "loss": 0.0, "num_tokens": 148642214.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06923948600888252, "epoch": 0.2648, "grad_norm": 0.0, "learning_rate": 3.077057903735287e-06, "loss": 0.0, "step": 3310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.7578125, "completions/mean_terminated_length": 213.48684692382812, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.06757545843720436, "epoch": 0.26488, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0767560536086593e-06, "loss": 0.0, "num_tokens": 148737287.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06575917452573776, "epoch": 0.26496, "grad_norm": 0.0, "learning_rate": 3.0764541106231205e-06, "loss": 0.0, "step": 3312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5078125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 233.046875, "completions/mean_terminated_length": 209.36509704589844, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.067723847925663, "epoch": 0.26504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.076152074799805e-06, "loss": 0.0, "num_tokens": 148832653.0, "reward": 0.8749990463256836, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8749990463256836, "rewards/reward_fn/std": 1.273596167564392, "step": 3313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0663447193801403, "epoch": 0.26512, "grad_norm": 0.0, "learning_rate": 3.07584994615985e-06, "loss": 0.0, "step": 3314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.078125, "completions/mean_terminated_length": 217.56625366210938, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.07174348831176758, "epoch": 0.2652, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.075547724724403e-06, "loss": 0.0, "num_tokens": 148927767.0, "reward": 0.25, "reward_std": 0.0, "rewards/reward_fn/mean": 0.25, "rewards/reward_fn/std": 0.6640368103981018, "step": 3315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0728357769548893, "epoch": 0.26528, "grad_norm": 0.0, "learning_rate": 3.0752454105146157e-06, "loss": 0.0, "step": 3316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.0546875, "completions/mean_terminated_length": 217.5205535888672, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.07192858681082726, "epoch": 0.26536, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0749430035516465e-06, "loss": 0.0, "num_tokens": 149023262.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06932850182056427, "epoch": 0.26544, "grad_norm": 0.0, "learning_rate": 3.0746405038566614e-06, "loss": 0.0, "step": 3318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 234.4453125, "completions/mean_terminated_length": 219.2133331298828, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.07632201164960861, "epoch": 0.26552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0743379114508324e-06, "loss": 0.0, "num_tokens": 149118807.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07608529180288315, "epoch": 0.2656, "grad_norm": 0.0, "learning_rate": 3.0740352263553356e-06, "loss": 0.0, "step": 3320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 214.6875, "completions/mean_terminated_length": 200.9166717529297, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.05824328027665615, "epoch": 0.26568, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.073732448591359e-06, "loss": 0.0, "num_tokens": 149211823.0, "reward": 0.19254472851753235, "reward_std": 0.0, "rewards/reward_fn/mean": 0.19254472851753235, "rewards/reward_fn/std": 0.3466104567050934, "step": 3321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05799748934805393, "epoch": 0.26576, "grad_norm": 0.0, "learning_rate": 3.073429578180092e-06, "loss": 0.0, "step": 3322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 231.328125, "completions/mean_terminated_length": 219.27906799316406, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.06373658031225204, "epoch": 0.26584, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.073126615142732e-06, "loss": 0.0, "num_tokens": 149306969.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0672677606344223, "epoch": 0.26592, "grad_norm": 0.0, "learning_rate": 3.0728235595004843e-06, "loss": 0.0, "step": 3324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.9609375, "completions/mean_terminated_length": 213.82894897460938, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.07521406188607216, "epoch": 0.266, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0725204112745592e-06, "loss": 0.0, "num_tokens": 149402068.0, "reward": 0.40443697571754456, "reward_std": 0.0, "rewards/reward_fn/mean": 0.40443697571754456, "rewards/reward_fn/std": 0.9879209995269775, "step": 3325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07603112608194351, "epoch": 0.26608, "grad_norm": 0.0, "learning_rate": 3.072217170486173e-06, "loss": 0.0, "step": 3326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 190.5078125, "completions/mean_terminated_length": 186.718994140625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.06498340144753456, "epoch": 0.26616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0719138371565506e-06, "loss": 0.0, "num_tokens": 149491989.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06542176753282547, "epoch": 0.26624, "grad_norm": 0.0, "learning_rate": 3.0716104113069217e-06, "loss": 0.0, "step": 3328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 206.15625, "completions/mean_terminated_length": 186.6521759033203, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.06048855930566788, "epoch": 0.26632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0713068929585225e-06, "loss": 0.0, "num_tokens": 149583913.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06254055164754391, "epoch": 0.2664, "grad_norm": 0.0, "learning_rate": 3.0710032821325972e-06, "loss": 0.0, "step": 3330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 219.5703125, "completions/mean_terminated_length": 209.83168029785156, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.066648680716753, "epoch": 0.26648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0706995788503936e-06, "loss": 0.0, "num_tokens": 149677554.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06990045681595802, "epoch": 0.26656, "grad_norm": 0.0, "learning_rate": 3.0703957831331696e-06, "loss": 0.0, "step": 3332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.46875, "completions/mean_terminated_length": 218.3505096435547, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.060679841786623, "epoch": 0.26664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0700918950021866e-06, "loss": 0.0, "num_tokens": 149772206.0, "reward": 0.4224936366081238, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4224936366081238, "rewards/reward_fn/std": 0.9859711527824402, "step": 3333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06294888630509377, "epoch": 0.26672, "grad_norm": 0.0, "learning_rate": 3.0697879144787136e-06, "loss": 0.0, "step": 3334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.4765625, "completions/mean_terminated_length": 239.16250610351562, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.06646046042442322, "epoch": 0.2668, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.069483841584026e-06, "loss": 0.0, "num_tokens": 149869163.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0641733780503273, "epoch": 0.26688, "grad_norm": 0.0, "learning_rate": 3.069179676339406e-06, "loss": 0.0, "step": 3336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 206.7265625, "completions/mean_terminated_length": 181.8000030517578, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.062215544283390045, "epoch": 0.26696, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0688754187661417e-06, "loss": 0.0, "num_tokens": 149961160.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 3337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06210572086274624, "epoch": 0.26704, "grad_norm": 0.0, "learning_rate": 3.0685710688855283e-06, "loss": 0.0, "step": 3338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.171875, "completions/mean_terminated_length": 232.56790161132812, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.0686853900551796, "epoch": 0.26712, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.068266626718867e-06, "loss": 0.0, "num_tokens": 150057566.0, "reward": 0.02467191591858864, "reward_std": 0.0, "rewards/reward_fn/mean": 0.02467191591858864, "rewards/reward_fn/std": 0.06553223729133606, "step": 3339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06638319417834282, "epoch": 0.2672, "grad_norm": 0.0, "learning_rate": 3.067962092287465e-06, "loss": 0.0, "step": 3340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 244.984375, "completions/mean_terminated_length": 235.85714721679688, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.07186299562454224, "epoch": 0.26728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.067657465612637e-06, "loss": 0.0, "num_tokens": 150154460.0, "reward": 0.04315175488591194, "reward_std": 0.0, "rewards/reward_fn/mean": 0.04315175488591194, "rewards/reward_fn/std": 0.11461740732192993, "step": 3341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07200220599770546, "epoch": 0.26736, "grad_norm": 0.0, "learning_rate": 3.067352746715703e-06, "loss": 0.0, "step": 3342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 220.7265625, "completions/mean_terminated_length": 183.1774139404297, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.06797048822045326, "epoch": 0.26744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0670479356179914e-06, "loss": 0.0, "num_tokens": 150248249.0, "reward": 0.4633024334907532, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4633024334907532, "rewards/reward_fn/std": 0.9901458024978638, "step": 3343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06320810690522194, "epoch": 0.26752, "grad_norm": 0.0, "learning_rate": 3.0667430323408342e-06, "loss": 0.0, "step": 3344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 240.796875, "completions/mean_terminated_length": 233.88636779785156, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.07276605069637299, "epoch": 0.2676, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.066438036905573e-06, "loss": 0.0, "num_tokens": 150344607.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07221702486276627, "epoch": 0.26768, "grad_norm": 0.0, "learning_rate": 3.066132949333553e-06, "loss": 0.0, "step": 3346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.6015625, "completions/mean_terminated_length": 221.86904907226562, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.06992626190185547, "epoch": 0.26776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.065827769646127e-06, "loss": 0.0, "num_tokens": 150440044.0, "reward": 0.05169707536697388, "reward_std": 0.0, "rewards/reward_fn/mean": 0.05169707536697388, "rewards/reward_fn/std": 0.0910414531826973, "step": 3347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0717787928879261, "epoch": 0.26784, "grad_norm": 0.0, "learning_rate": 3.0655224978646558e-06, "loss": 0.0, "step": 3348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 229.25, "completions/mean_terminated_length": 206.37681579589844, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.06446267664432526, "epoch": 0.26792, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.065217134010504e-06, "loss": 0.0, "num_tokens": 150534924.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06628986448049545, "epoch": 0.268, "grad_norm": 0.0, "learning_rate": 3.064911678105044e-06, "loss": 0.0, "step": 3350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.5078125, "completions/mean_terminated_length": 220.62353515625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.06584542989730835, "epoch": 0.26808, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0646061301696546e-06, "loss": 0.0, "num_tokens": 150630221.0, "reward": 0.08013462275266647, "reward_std": 0.0, "rewards/reward_fn/mean": 0.08013462275266647, "rewards/reward_fn/std": 0.21284934878349304, "step": 3351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06762085109949112, "epoch": 0.26816, "grad_norm": 0.0, "learning_rate": 3.064300490225721e-06, "loss": 0.0, "step": 3352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.6171875, "completions/mean_terminated_length": 241.06741333007812, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.07131645828485489, "epoch": 0.26824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0639947582946345e-06, "loss": 0.0, "num_tokens": 150727196.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07027328014373779, "epoch": 0.26832, "grad_norm": 0.0, "learning_rate": 3.0636889343977936e-06, "loss": 0.0, "step": 3354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 237.453125, "completions/mean_terminated_length": 223.91893005371094, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.06897877156734467, "epoch": 0.2684, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0633830185566025e-06, "loss": 0.0, "num_tokens": 150823126.0, "reward": 0.08158833533525467, "reward_std": 0.0, "rewards/reward_fn/mean": 0.08158833533525467, "rewards/reward_fn/std": 0.21671062707901, "step": 3355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06860855966806412, "epoch": 0.26848, "grad_norm": 0.0, "learning_rate": 3.063077010792472e-06, "loss": 0.0, "step": 3356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.75, "completions/mean_terminated_length": 237.05262756347656, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.07311467826366425, "epoch": 0.26856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0627709111268196e-06, "loss": 0.0, "num_tokens": 150919990.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07240289077162743, "epoch": 0.26864, "grad_norm": 0.0, "learning_rate": 3.062464719581069e-06, "loss": 0.0, "step": 3358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.0546875, "completions/mean_terminated_length": 238.07041931152344, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.07420869916677475, "epoch": 0.26872, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.06215843617665e-06, "loss": 0.0, "num_tokens": 151017021.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07420870289206505, "epoch": 0.2688, "grad_norm": 0.0, "learning_rate": 3.0618520609349995e-06, "loss": 0.0, "step": 3360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.71875, "completions/mean_terminated_length": 224.8518524169922, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.07234493643045425, "epoch": 0.26888, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0615455938775604e-06, "loss": 0.0, "num_tokens": 151111961.0, "reward": 0.11220657825469971, "reward_std": 0.0, "rewards/reward_fn/mean": 0.11220657825469971, "rewards/reward_fn/std": 0.29803720116615295, "step": 3361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07101158052682877, "epoch": 0.26896, "grad_norm": 0.0, "learning_rate": 3.0612390350257827e-06, "loss": 0.0, "step": 3362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.671875, "completions/mean_terminated_length": 207.4166717529297, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.06920799985527992, "epoch": 0.26904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0609323844011213e-06, "loss": 0.0, "num_tokens": 151206767.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07135576009750366, "epoch": 0.26912, "grad_norm": 0.0, "learning_rate": 3.060625642025039e-06, "loss": 0.0, "step": 3364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 243.4921875, "completions/mean_terminated_length": 232.10447692871094, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.07162216678261757, "epoch": 0.2692, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.060318807919005e-06, "loss": 0.0, "num_tokens": 151303470.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07462265715003014, "epoch": 0.26928, "grad_norm": 0.0, "learning_rate": 3.060011882104494e-06, "loss": 0.0, "step": 3366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 244.6796875, "completions/mean_terminated_length": 236.41893005371094, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.07026379555463791, "epoch": 0.26936, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.059704864602988e-06, "loss": 0.0, "num_tokens": 151400325.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06863358244299889, "epoch": 0.26944, "grad_norm": 0.0, "learning_rate": 3.0593977554359737e-06, "loss": 0.0, "step": 3368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.4375, "completions/mean_terminated_length": 237.4794464111328, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.07202885299921036, "epoch": 0.26952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0590905546249467e-06, "loss": 0.0, "num_tokens": 151497277.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07061794400215149, "epoch": 0.2696, "grad_norm": 0.0, "learning_rate": 3.0587832621914073e-06, "loss": 0.0, "step": 3370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 219.65625, "completions/mean_terminated_length": 202.5287322998047, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.06981631368398666, "epoch": 0.26968, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0584758781568633e-06, "loss": 0.0, "num_tokens": 151590929.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07207687571644783, "epoch": 0.26976, "grad_norm": 0.0, "learning_rate": 3.0581684025428276e-06, "loss": 0.0, "step": 3372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 245.15625, "completions/mean_terminated_length": 238.86419677734375, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.06420670822262764, "epoch": 0.26984, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.057860835370821e-06, "loss": 0.0, "num_tokens": 151687845.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06794020533561707, "epoch": 0.26992, "grad_norm": 0.0, "learning_rate": 3.0575531766623693e-06, "loss": 0.0, "step": 3374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 230.25, "completions/mean_terminated_length": 215.3086395263672, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.06408421695232391, "epoch": 0.27, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0572454264390057e-06, "loss": 0.0, "num_tokens": 151782853.0, "reward": 0.8703233599662781, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8703233599662781, "rewards/reward_fn/std": 1.2731940746307373, "step": 3375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06602941080927849, "epoch": 0.27008, "grad_norm": 0.0, "learning_rate": 3.056937584722269e-06, "loss": 0.0, "step": 3376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.8671875, "completions/mean_terminated_length": 231.8800048828125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.06514300405979156, "epoch": 0.27016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.056629651533706e-06, "loss": 0.0, "num_tokens": 151879348.0, "reward": 0.03178694099187851, "reward_std": 0.0, "rewards/reward_fn/mean": 0.03178694099187851, "rewards/reward_fn/std": 0.08443079143762589, "step": 3377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06484497711062431, "epoch": 0.27024, "grad_norm": 0.0, "learning_rate": 3.056321626894867e-06, "loss": 0.0, "step": 3378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 220.6640625, "completions/mean_terminated_length": 212.08738708496094, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.06888197362422943, "epoch": 0.27032, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0560135108273125e-06, "loss": 0.0, "num_tokens": 151973129.0, "reward": 0.12644658982753754, "reward_std": 0.0, "rewards/reward_fn/mean": 0.12644658982753754, "rewards/reward_fn/std": 0.3209598660469055, "step": 3379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06684256717562675, "epoch": 0.2704, "grad_norm": 0.0, "learning_rate": 3.0557053033526055e-06, "loss": 0.0, "step": 3380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 233.3125, "completions/mean_terminated_length": 226.0618438720703, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.07267001643776894, "epoch": 0.27048, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.055397004492319e-06, "loss": 0.0, "num_tokens": 152068529.0, "reward": 0.4020647704601288, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4020647704601288, "rewards/reward_fn/std": 0.9883498549461365, "step": 3381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07154685258865356, "epoch": 0.27056, "grad_norm": 0.0, "learning_rate": 3.055088614268029e-06, "loss": 0.0, "step": 3382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.4296875, "completions/mean_terminated_length": 234.28750610351562, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.0738193728029728, "epoch": 0.27064, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0547801327013207e-06, "loss": 0.0, "num_tokens": 152165096.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07089021801948547, "epoch": 0.27072, "grad_norm": 0.0, "learning_rate": 3.054471559813784e-06, "loss": 0.0, "step": 3384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5078125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 227.0078125, "completions/mean_terminated_length": 197.09524536132812, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.07804827392101288, "epoch": 0.2708, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.054162895627016e-06, "loss": 0.0, "num_tokens": 152259689.0, "reward": 0.597317099571228, "reward_std": 0.0, "rewards/reward_fn/mean": 0.597317099571228, "rewards/reward_fn/std": 0.9868561625480652, "step": 3385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07721440121531487, "epoch": 0.27088, "grad_norm": 0.0, "learning_rate": 3.05385414016262e-06, "loss": 0.0, "step": 3386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 237.5390625, "completions/mean_terminated_length": 231.12632751464844, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.06786364316940308, "epoch": 0.27096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0535452934422054e-06, "loss": 0.0, "num_tokens": 152355630.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06658584624528885, "epoch": 0.27104, "grad_norm": 0.0, "learning_rate": 3.053236355487389e-06, "loss": 0.0, "step": 3388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.4140625, "completions/mean_terminated_length": 236.7525634765625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.06853882968425751, "epoch": 0.27112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.052927326319791e-06, "loss": 0.0, "num_tokens": 152452067.0, "reward": 0.0074910130351781845, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0074910130351781845, "rewards/reward_fn/std": 0.019897233694791794, "step": 3389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06810342520475388, "epoch": 0.2712, "grad_norm": 0.0, "learning_rate": 3.052618205961043e-06, "loss": 0.0, "step": 3390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 249.6328125, "completions/mean_terminated_length": 240.32693481445312, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.07203011959791183, "epoch": 0.27128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0523089944327777e-06, "loss": 0.0, "num_tokens": 152549556.0, "reward": 0.4384971857070923, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4384971857070923, "rewards/reward_fn/std": 0.9861915111541748, "step": 3391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07183044031262398, "epoch": 0.27136, "grad_norm": 0.0, "learning_rate": 3.051999691756639e-06, "loss": 0.0, "step": 3392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 217.140625, "completions/mean_terminated_length": 204.1875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.06298473477363586, "epoch": 0.27144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0516902979542726e-06, "loss": 0.0, "num_tokens": 152642886.0, "reward": 0.45800459384918213, "reward_std": 0.0, "rewards/reward_fn/mean": 0.45800459384918213, "rewards/reward_fn/std": 0.9889340400695801, "step": 3393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06322492472827435, "epoch": 0.27152, "grad_norm": 0.0, "learning_rate": 3.051380813047334e-06, "loss": 0.0, "step": 3394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.1171875, "completions/mean_terminated_length": 232.44285583496094, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.06559576466679573, "epoch": 0.2716, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.051071237057484e-06, "loss": 0.0, "num_tokens": 152739541.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06500093638896942, "epoch": 0.27168, "grad_norm": 0.0, "learning_rate": 3.050761570006389e-06, "loss": 0.0, "step": 3396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 242.9140625, "completions/mean_terminated_length": 235.5731658935547, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.07080965116620064, "epoch": 0.27176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0504518119157227e-06, "loss": 0.0, "num_tokens": 152836170.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06772588565945625, "epoch": 0.27184, "grad_norm": 0.0, "learning_rate": 3.0501419628071654e-06, "loss": 0.0, "step": 3398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.4453125, "completions/mean_terminated_length": 222.26922607421875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.06699375808238983, "epoch": 0.27192, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0498320227024024e-06, "loss": 0.0, "num_tokens": 152931843.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06785440444946289, "epoch": 0.272, "grad_norm": 0.0, "learning_rate": 3.0495219916231274e-06, "loss": 0.0, "step": 3400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 222.6484375, "completions/mean_terminated_length": 196.7083282470703, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.06458587199449539, "epoch": 0.27208, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0492118695910373e-06, "loss": 0.0, "num_tokens": 153025878.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.061284014955163, "epoch": 0.27216, "grad_norm": 0.0, "learning_rate": 3.0489016566278397e-06, "loss": 0.0, "step": 3402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.1015625, "completions/mean_terminated_length": 233.68919372558594, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.07347005605697632, "epoch": 0.27224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.048591352755244e-06, "loss": 0.0, "num_tokens": 153122531.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 3403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0727284699678421, "epoch": 0.27232, "grad_norm": 0.0, "learning_rate": 3.04828095799497e-06, "loss": 0.0, "step": 3404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 234.984375, "completions/mean_terminated_length": 228.26803588867188, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06971221417188644, "epoch": 0.2724, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0479704723687417e-06, "loss": 0.0, "num_tokens": 153218145.0, "reward": 0.05404704809188843, "reward_std": 0.0, "rewards/reward_fn/mean": 0.05404704809188843, "rewards/reward_fn/std": 0.09590700268745422, "step": 3405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07140756398439407, "epoch": 0.27248, "grad_norm": 0.0, "learning_rate": 3.047659895898289e-06, "loss": 0.0, "step": 3406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.4140625, "completions/mean_terminated_length": 224.2210693359375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.06477059051394463, "epoch": 0.27256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0473492286053498e-06, "loss": 0.0, "num_tokens": 153313430.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0684799812734127, "epoch": 0.27264, "grad_norm": 0.0, "learning_rate": 3.0470384705116666e-06, "loss": 0.0, "step": 3408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.9609375, "completions/mean_terminated_length": 235.80899047851562, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.0699552372097969, "epoch": 0.27272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0467276216389903e-06, "loss": 0.0, "num_tokens": 153409937.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06979486718773842, "epoch": 0.2728, "grad_norm": 0.0, "learning_rate": 3.046416682009076e-06, "loss": 0.0, "step": 3410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 232.7421875, "completions/mean_terminated_length": 224.32977294921875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.07293207198381424, "epoch": 0.27288, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0461056516436872e-06, "loss": 0.0, "num_tokens": 153505264.0, "reward": 0.004997334908694029, "reward_std": 0.0, "rewards/reward_fn/mean": 0.004997334908694029, "rewards/reward_fn/std": 0.013273656368255615, "step": 3411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07178407162427902, "epoch": 0.27296, "grad_norm": 0.0, "learning_rate": 3.045794530564592e-06, "loss": 0.0, "step": 3412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.5625, "completions/mean_terminated_length": 233.36842346191406, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.07290490344166756, "epoch": 0.27304, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0454833187935663e-06, "loss": 0.0, "num_tokens": 153601848.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07296759262681007, "epoch": 0.27312, "grad_norm": 0.0, "learning_rate": 3.0451720163523905e-06, "loss": 0.0, "step": 3414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.2734375, "completions/mean_terminated_length": 226.63954162597656, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.07665326818823814, "epoch": 0.2732, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0448606232628536e-06, "loss": 0.0, "num_tokens": 153697627.0, "reward": 0.03178694099187851, "reward_std": 0.0, "rewards/reward_fn/mean": 0.03178694099187851, "rewards/reward_fn/std": 0.08443079143762589, "step": 3415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.075425885617733, "epoch": 0.27328, "grad_norm": 0.0, "learning_rate": 3.044549139546749e-06, "loss": 0.0, "step": 3416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 243.390625, "completions/mean_terminated_length": 236.3170623779297, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.072350163012743, "epoch": 0.27336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.044237565225878e-06, "loss": 0.0, "num_tokens": 153794317.0, "reward": 0.0074910130351781845, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0074910130351781845, "rewards/reward_fn/std": 0.019897233694791794, "step": 3417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07172346115112305, "epoch": 0.27344, "grad_norm": 0.0, "learning_rate": 3.043925900322047e-06, "loss": 0.0, "step": 3418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 192.1796875, "completions/mean_terminated_length": 165.23333740234375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.06385781988501549, "epoch": 0.27352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.04361414485707e-06, "loss": 0.0, "num_tokens": 153884452.0, "reward": 1.2487332820892334, "reward_std": 0.0, "rewards/reward_fn/mean": 1.2487332820892334, "rewards/reward_fn/std": 1.3976428508758545, "step": 3419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0664270892739296, "epoch": 0.2736, "grad_norm": 0.0, "learning_rate": 3.0433022988527655e-06, "loss": 0.0, "step": 3420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.546875, "completions/mean_terminated_length": 211.09375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.0646987110376358, "epoch": 0.27368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.04299036233096e-06, "loss": 0.0, "num_tokens": 153979882.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06393971666693687, "epoch": 0.27376, "grad_norm": 0.0, "learning_rate": 3.042678335313486e-06, "loss": 0.0, "step": 3422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 227.78125, "completions/mean_terminated_length": 202.88235473632812, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.07362780719995499, "epoch": 0.27384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.042366217822181e-06, "loss": 0.0, "num_tokens": 154074574.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07265811413526535, "epoch": 0.27392, "grad_norm": 0.0, "learning_rate": 3.042054009878891e-06, "loss": 0.0, "step": 3424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.875, "completions/mean_terminated_length": 208.3047637939453, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.07189725339412689, "epoch": 0.274, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.041741711505467e-06, "loss": 0.0, "num_tokens": 154167870.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06896991282701492, "epoch": 0.27408, "grad_norm": 0.0, "learning_rate": 3.041429322723767e-06, "loss": 0.0, "step": 3426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.9296875, "completions/mean_terminated_length": 234.27272033691406, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.06918351352214813, "epoch": 0.27416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0411168435556542e-06, "loss": 0.0, "num_tokens": 154264501.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07029915228486061, "epoch": 0.27424, "grad_norm": 0.0, "learning_rate": 3.040804274022999e-06, "loss": 0.0, "step": 3428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.375, "completions/mean_terminated_length": 238.20407104492188, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.06430962681770325, "epoch": 0.27432, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0404916141476783e-06, "loss": 0.0, "num_tokens": 154361061.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06795680150389671, "epoch": 0.2744, "grad_norm": 0.0, "learning_rate": 3.0401788639515744e-06, "loss": 0.0, "step": 3430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 234.265625, "completions/mean_terminated_length": 227.02084350585938, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.06231346353888512, "epoch": 0.27448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.039866023456577e-06, "loss": 0.0, "num_tokens": 154456583.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06539661809802055, "epoch": 0.27456, "grad_norm": 0.0, "learning_rate": 3.039553092684582e-06, "loss": 0.0, "step": 3432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 211.703125, "completions/mean_terminated_length": 195.68084716796875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.060946447774767876, "epoch": 0.27464, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0392400716574905e-06, "loss": 0.0, "num_tokens": 154549217.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.062003057450056076, "epoch": 0.27472, "grad_norm": 0.0, "learning_rate": 3.038926960397211e-06, "loss": 0.0, "step": 3434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 239.34375, "completions/mean_terminated_length": 233.3191375732422, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.06252987682819366, "epoch": 0.2748, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0386137589256577e-06, "loss": 0.0, "num_tokens": 154645389.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06373182311654091, "epoch": 0.27488, "grad_norm": 0.0, "learning_rate": 3.038300467264752e-06, "loss": 0.0, "step": 3436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.59375, "completions/mean_terminated_length": 232.7916717529297, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.07072257623076439, "epoch": 0.27496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.03798708543642e-06, "loss": 0.0, "num_tokens": 154741465.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0701938048005104, "epoch": 0.27504, "grad_norm": 0.0, "learning_rate": 3.0376736134625957e-06, "loss": 0.0, "step": 3438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.4140625, "completions/mean_terminated_length": 233.23170471191406, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.07042818516492844, "epoch": 0.27512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0373600513652194e-06, "loss": 0.0, "num_tokens": 154837902.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0719078816473484, "epoch": 0.2752, "grad_norm": 0.0, "learning_rate": 3.0370463991662363e-06, "loss": 0.0, "step": 3440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 237.859375, "completions/mean_terminated_length": 229.3103485107422, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.06471001729369164, "epoch": 0.27528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.036732656887599e-06, "loss": 0.0, "num_tokens": 154933884.0, "reward": 0.08953723311424255, "reward_std": 0.0, "rewards/reward_fn/mean": 0.08953723311424255, "rewards/reward_fn/std": 0.23782405257225037, "step": 3441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06384261697530746, "epoch": 0.27536, "grad_norm": 0.0, "learning_rate": 3.0364188245512668e-06, "loss": 0.0, "step": 3442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 234.078125, "completions/mean_terminated_length": 219.55844116210938, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.06846858561038971, "epoch": 0.27544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.036104902179203e-06, "loss": 0.0, "num_tokens": 155029382.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06604675203561783, "epoch": 0.27552, "grad_norm": 0.0, "learning_rate": 3.0357908897933807e-06, "loss": 0.0, "step": 3444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.5546875, "completions/mean_terminated_length": 233.93589782714844, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.07155037671327591, "epoch": 0.2756, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0354767874157765e-06, "loss": 0.0, "num_tokens": 155125965.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07396824285387993, "epoch": 0.27568, "grad_norm": 0.0, "learning_rate": 3.0351625950683743e-06, "loss": 0.0, "step": 3446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 218.2421875, "completions/mean_terminated_length": 202.3000030517578, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.06627286598086357, "epoch": 0.27576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.034848312773164e-06, "loss": 0.0, "num_tokens": 155219436.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06709276884794235, "epoch": 0.27584, "grad_norm": 0.0, "learning_rate": 3.034533940552143e-06, "loss": 0.0, "step": 3448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.2265625, "completions/mean_terminated_length": 226.45631408691406, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.07242291420698166, "epoch": 0.27592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.034219478427313e-06, "loss": 0.0, "num_tokens": 155314697.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07286978885531425, "epoch": 0.276, "grad_norm": 0.0, "learning_rate": 3.0339049264206833e-06, "loss": 0.0, "step": 3450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.1640625, "completions/mean_terminated_length": 205.8988800048828, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.06768376380205154, "epoch": 0.27608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0335902845542695e-06, "loss": 0.0, "num_tokens": 155408542.0, "reward": 0.41141408681869507, "reward_std": 0.0, "rewards/reward_fn/mean": 0.41141408681869507, "rewards/reward_fn/std": 0.9868917465209961, "step": 3451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06625936925411224, "epoch": 0.27616, "grad_norm": 0.0, "learning_rate": 3.0332755528500934e-06, "loss": 0.0, "step": 3452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.7890625, "completions/mean_terminated_length": 231.7912139892578, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.06846749037504196, "epoch": 0.27624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.032960731330182e-06, "loss": 0.0, "num_tokens": 155504643.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06990865617990494, "epoch": 0.27632, "grad_norm": 0.0, "learning_rate": 3.0326458200165703e-06, "loss": 0.0, "step": 3454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 218.3828125, "completions/mean_terminated_length": 213.76315307617188, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.06967643275856972, "epoch": 0.2764, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0323308189312983e-06, "loss": 0.0, "num_tokens": 155598132.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0681118592619896, "epoch": 0.27648, "grad_norm": 0.0, "learning_rate": 3.0320157280964138e-06, "loss": 0.0, "step": 3456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.8984375, "completions/mean_terminated_length": 203.86956787109375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.07018252089619637, "epoch": 0.27656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.031700547533968e-06, "loss": 0.0, "num_tokens": 155692839.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0712725855410099, "epoch": 0.27664, "grad_norm": 0.0, "learning_rate": 3.031385277266021e-06, "loss": 0.0, "step": 3458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.078125, "completions/mean_terminated_length": 235.18031311035156, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.07719145715236664, "epoch": 0.27672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.03106991731464e-06, "loss": 0.0, "num_tokens": 155789873.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07453331351280212, "epoch": 0.2768, "grad_norm": 0.0, "learning_rate": 3.030754467701894e-06, "loss": 0.0, "step": 3460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5703125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.6953125, "completions/mean_terminated_length": 232.01817321777344, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.06701699271798134, "epoch": 0.27688, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.030438928449863e-06, "loss": 0.0, "num_tokens": 155886858.0, "reward": 0.4067869484424591, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4067869484424591, "rewards/reward_fn/std": 0.9875356554985046, "step": 3461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06592585518956184, "epoch": 0.27696, "grad_norm": 0.0, "learning_rate": 3.0301232995806313e-06, "loss": 0.0, "step": 3462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 215.5234375, "completions/mean_terminated_length": 205.2058868408203, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.07275347411632538, "epoch": 0.27704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0298075811162892e-06, "loss": 0.0, "num_tokens": 155979981.0, "reward": 0.7974936366081238, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7974936366081238, "rewards/reward_fn/std": 1.2825363874435425, "step": 3463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07567575201392174, "epoch": 0.27712, "grad_norm": 0.0, "learning_rate": 3.0294917730789334e-06, "loss": 0.0, "step": 3464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 238.8515625, "completions/mean_terminated_length": 227.85897827148438, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.0734679326415062, "epoch": 0.2772, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.029175875490667e-06, "loss": 0.0, "num_tokens": 156076090.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07573465630412102, "epoch": 0.27728, "grad_norm": 0.0, "learning_rate": 3.0288598883736013e-06, "loss": 0.0, "step": 3466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 237.53125, "completions/mean_terminated_length": 227.85714721679688, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.06298239529132843, "epoch": 0.27736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.02854381174985e-06, "loss": 0.0, "num_tokens": 156172030.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06232801452279091, "epoch": 0.27744, "grad_norm": 0.0, "learning_rate": 3.028227645641536e-06, "loss": 0.0, "step": 3468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 212.46875, "completions/mean_terminated_length": 186.35000610351562, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.06371744349598885, "epoch": 0.27752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0279113900707877e-06, "loss": 0.0, "num_tokens": 156264762.0, "reward": 0.04533843323588371, "reward_std": 0.0, "rewards/reward_fn/mean": 0.04533843323588371, "rewards/reward_fn/std": 0.12042555958032608, "step": 3469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0652744509279728, "epoch": 0.2776, "grad_norm": 0.0, "learning_rate": 3.027595045059739e-06, "loss": 0.0, "step": 3470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.625, "completions/mean_terminated_length": 215.31430053710938, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.06595122069120407, "epoch": 0.27768, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0272786106305312e-06, "loss": 0.0, "num_tokens": 156358794.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06579180061817169, "epoch": 0.27776, "grad_norm": 0.0, "learning_rate": 3.0269620868053105e-06, "loss": 0.0, "step": 3472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 216.3984375, "completions/mean_terminated_length": 194.92770385742188, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.06399042904376984, "epoch": 0.27784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.026645473606232e-06, "loss": 0.0, "num_tokens": 156452029.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06577427685260773, "epoch": 0.27792, "grad_norm": 0.0, "learning_rate": 3.0263287710554544e-06, "loss": 0.0, "step": 3474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.8359375, "completions/mean_terminated_length": 211.8142852783203, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.06392507813870907, "epoch": 0.278, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0260119791751425e-06, "loss": 0.0, "num_tokens": 156547240.0, "reward": 0.7746719121932983, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7746719121932983, "rewards/reward_fn/std": 1.2914265394210815, "step": 3475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06287877634167671, "epoch": 0.27808, "grad_norm": 0.0, "learning_rate": 3.02569509798747e-06, "loss": 0.0, "step": 3476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 235.6796875, "completions/mean_terminated_length": 230.99038696289062, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.06982195749878883, "epoch": 0.27816, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0253781275146147e-06, "loss": 0.0, "num_tokens": 156642943.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07111677527427673, "epoch": 0.27824, "grad_norm": 0.0, "learning_rate": 3.0250610677787604e-06, "loss": 0.0, "step": 3478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 195.375, "completions/mean_terminated_length": 175.1666717529297, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.06843562796711922, "epoch": 0.27832, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.024743918802099e-06, "loss": 0.0, "num_tokens": 156733487.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 3479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06363430805504322, "epoch": 0.2784, "grad_norm": 0.0, "learning_rate": 3.0244266806068267e-06, "loss": 0.0, "step": 3480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 199.703125, "completions/mean_terminated_length": 181.71133422851562, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.07062230631709099, "epoch": 0.27848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0241093532151476e-06, "loss": 0.0, "num_tokens": 156824585.0, "reward": 1.214537262916565, "reward_std": 0.0, "rewards/reward_fn/mean": 1.214537262916565, "rewards/reward_fn/std": 1.4069468975067139, "step": 3481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0651499293744564, "epoch": 0.27856, "grad_norm": 0.0, "learning_rate": 3.023791936649271e-06, "loss": 0.0, "step": 3482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.6953125, "completions/mean_terminated_length": 231.59375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.0660693608224392, "epoch": 0.27864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.023474430931412e-06, "loss": 0.0, "num_tokens": 156920546.0, "reward": 0.03178694099187851, "reward_std": 0.0, "rewards/reward_fn/mean": 0.03178694099187851, "rewards/reward_fn/std": 0.08443079143762589, "step": 3483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06865399330854416, "epoch": 0.27872, "grad_norm": 0.0, "learning_rate": 3.023156836083794e-06, "loss": 0.0, "step": 3484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.9375, "completions/mean_terminated_length": 242.30303955078125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.06815600395202637, "epoch": 0.2788, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.022839152128644e-06, "loss": 0.0, "num_tokens": 157017946.0, "reward": 0.0722954273223877, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0722954273223877, "rewards/reward_fn/std": 0.192027285695076, "step": 3485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06807485222816467, "epoch": 0.27888, "grad_norm": 0.0, "learning_rate": 3.0225213790881973e-06, "loss": 0.0, "step": 3486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.6875, "completions/mean_terminated_length": 221.02565002441406, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.07237723469734192, "epoch": 0.27896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.022203516984694e-06, "loss": 0.0, "num_tokens": 157113522.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07339314371347427, "epoch": 0.27904, "grad_norm": 0.0, "learning_rate": 3.0218855658403823e-06, "loss": 0.0, "step": 3488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 227.71875, "completions/mean_terminated_length": 213.41177368164062, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.07731486856937408, "epoch": 0.27912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0215675256775144e-06, "loss": 0.0, "num_tokens": 157208206.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07496598362922668, "epoch": 0.2792, "grad_norm": 0.0, "learning_rate": 3.02124939651835e-06, "loss": 0.0, "step": 3490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.6484375, "completions/mean_terminated_length": 221.83750915527344, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.07814410701394081, "epoch": 0.27928, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0209311783851543e-06, "loss": 0.0, "num_tokens": 157303777.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07938215509057045, "epoch": 0.27936, "grad_norm": 0.0, "learning_rate": 3.0206128713002e-06, "loss": 0.0, "step": 3492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.6953125, "completions/mean_terminated_length": 217.20999145507812, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.06393321044743061, "epoch": 0.27944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.020294475285765e-06, "loss": 0.0, "num_tokens": 157398202.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06403707712888718, "epoch": 0.27952, "grad_norm": 0.0, "learning_rate": 3.0199759903641338e-06, "loss": 0.0, "step": 3494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.9609375, "completions/mean_terminated_length": 239.56976318359375, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.0717644989490509, "epoch": 0.2796, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.019657416557596e-06, "loss": 0.0, "num_tokens": 157495093.0, "reward": 0.4020647704601288, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4020647704601288, "rewards/reward_fn/std": 0.9883498549461365, "step": 3495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07007641717791557, "epoch": 0.27968, "grad_norm": 0.0, "learning_rate": 3.0193387538884502e-06, "loss": 0.0, "step": 3496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.578125, "completions/mean_terminated_length": 188.2153778076172, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.07058186084032059, "epoch": 0.27976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0190200023789974e-06, "loss": 0.0, "num_tokens": 157588991.0, "reward": 0.7931517362594604, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7931517362594604, "rewards/reward_fn/std": 1.2840120792388916, "step": 3497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07082688063383102, "epoch": 0.27984, "grad_norm": 0.0, "learning_rate": 3.018701162051548e-06, "loss": 0.0, "step": 3498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.5625, "completions/mean_terminated_length": 206.47190856933594, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.07179228216409683, "epoch": 0.27992, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0183822329284177e-06, "loss": 0.0, "num_tokens": 157682887.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06798940896987915, "epoch": 0.28, "grad_norm": 0.0, "learning_rate": 3.0180632150319277e-06, "loss": 0.0, "step": 3500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.96875, "completions/mean_terminated_length": 191.94871520996094, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.07012061402201653, "epoch": 0.28008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.017744108384406e-06, "loss": 0.0, "num_tokens": 157776195.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07162048667669296, "epoch": 0.28016, "grad_norm": 0.0, "learning_rate": 3.017424913008186e-06, "loss": 0.0, "step": 3502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.359375, "completions/mean_terminated_length": 220.40000915527344, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.07187926024198532, "epoch": 0.28024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.017105628925609e-06, "loss": 0.0, "num_tokens": 157871473.0, "reward": 0.7524996995925903, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7524996995925903, "rewards/reward_fn/std": 1.302709698677063, "step": 3503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07211232930421829, "epoch": 0.28032, "grad_norm": 0.0, "learning_rate": 3.016786256159021e-06, "loss": 0.0, "step": 3504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 220.5078125, "completions/mean_terminated_length": 209.64285278320312, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.0698452964425087, "epoch": 0.2804, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0164667947307754e-06, "loss": 0.0, "num_tokens": 157965234.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06831124797463417, "epoch": 0.28048, "grad_norm": 0.0, "learning_rate": 3.01614724466323e-06, "loss": 0.0, "step": 3506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 229.15625, "completions/mean_terminated_length": 222.9615478515625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.06774397566914558, "epoch": 0.28056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.015827605978751e-06, "loss": 0.0, "num_tokens": 158060102.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06868665292859077, "epoch": 0.28064, "grad_norm": 0.0, "learning_rate": 3.015507878699709e-06, "loss": 0.0, "step": 3508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 220.6953125, "completions/mean_terminated_length": 196.5394744873047, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.060984231531620026, "epoch": 0.28072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0151880628484814e-06, "loss": 0.0, "num_tokens": 158153887.0, "reward": 0.42564278841018677, "reward_std": 0.0, "rewards/reward_fn/mean": 0.42564278841018677, "rewards/reward_fn/std": 0.9832224249839783, "step": 3509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06353635527193546, "epoch": 0.2808, "grad_norm": 0.0, "learning_rate": 3.014868158447453e-06, "loss": 0.0, "step": 3510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 224.4140625, "completions/mean_terminated_length": 212.05435180664062, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.06300792098045349, "epoch": 0.28088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.014548165519012e-06, "loss": 0.0, "num_tokens": 158248148.0, "reward": 0.4970853924751282, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4970853924751282, "rewards/reward_fn/std": 1.0024950504302979, "step": 3511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.061009302735328674, "epoch": 0.28096, "grad_norm": 0.0, "learning_rate": 3.0142280840855564e-06, "loss": 0.0, "step": 3512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 233.40625, "completions/mean_terminated_length": 207.80001831054688, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.06690190732479095, "epoch": 0.28104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0139079141694877e-06, "loss": 0.0, "num_tokens": 158343560.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 3513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.065481748431921, "epoch": 0.28112, "grad_norm": 0.0, "learning_rate": 3.0135876557932146e-06, "loss": 0.0, "step": 3514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 233.390625, "completions/mean_terminated_length": 214.6571502685547, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.07291325554251671, "epoch": 0.2812, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.013267308979151e-06, "loss": 0.0, "num_tokens": 158438970.0, "reward": 0.415934681892395, "reward_std": 0.0, "rewards/reward_fn/mean": 0.415934681892395, "rewards/reward_fn/std": 0.9864103198051453, "step": 3515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07127373665571213, "epoch": 0.28128, "grad_norm": 0.0, "learning_rate": 3.0129468737497183e-06, "loss": 0.0, "step": 3516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.9921875, "completions/mean_terminated_length": 238.16456604003906, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.07260199263691902, "epoch": 0.28136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0126263501273445e-06, "loss": 0.0, "num_tokens": 158535865.0, "reward": 0.37749966979026794, "reward_std": 0.0, "rewards/reward_fn/mean": 0.37749966979026794, "rewards/reward_fn/std": 0.9951284527778625, "step": 3517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06749250739812851, "epoch": 0.28144, "grad_norm": 0.0, "learning_rate": 3.0123057381344616e-06, "loss": 0.0, "step": 3518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.96875, "completions/mean_terminated_length": 230.08509826660156, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.06880873814225197, "epoch": 0.28152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0119850377935093e-06, "loss": 0.0, "num_tokens": 158631733.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07050587981939316, "epoch": 0.2816, "grad_norm": 0.0, "learning_rate": 3.0116642491269335e-06, "loss": 0.0, "step": 3520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 210.765625, "completions/mean_terminated_length": 195.05264282226562, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.060419997200369835, "epoch": 0.28168, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0113433721571865e-06, "loss": 0.0, "num_tokens": 158724247.0, "reward": 0.012458499521017075, "reward_std": 0.0, "rewards/reward_fn/mean": 0.012458499521017075, "rewards/reward_fn/std": 0.03309160843491554, "step": 3521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06436969712376595, "epoch": 0.28176, "grad_norm": 0.0, "learning_rate": 3.011022406906725e-06, "loss": 0.0, "step": 3522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.4140625, "completions/mean_terminated_length": 230.1413116455078, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.07195287570357323, "epoch": 0.28184, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0107013533980144e-06, "loss": 0.0, "num_tokens": 158820172.0, "reward": 0.04961630329489708, "reward_std": 0.0, "rewards/reward_fn/mean": 0.04961630329489708, "rewards/reward_fn/std": 0.1317882090806961, "step": 3523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07178857550024986, "epoch": 0.28192, "grad_norm": 0.0, "learning_rate": 3.0103802116535244e-06, "loss": 0.0, "step": 3524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.859375, "completions/mean_terminated_length": 234.43589782714844, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.06769256666302681, "epoch": 0.282, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0100589816957316e-06, "loss": 0.0, "num_tokens": 158916794.0, "reward": 0.44033318758010864, "reward_std": 0.0, "rewards/reward_fn/mean": 0.44033318758010864, "rewards/reward_fn/std": 0.9863339066505432, "step": 3525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06670790538191795, "epoch": 0.28208, "grad_norm": 0.0, "learning_rate": 3.009737663547119e-06, "loss": 0.0, "step": 3526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.2109375, "completions/mean_terminated_length": 214.9479217529297, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.06411165744066238, "epoch": 0.28216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.009416257230175e-06, "loss": 0.0, "num_tokens": 159011157.0, "reward": 0.37749966979026794, "reward_std": 0.0, "rewards/reward_fn/mean": 0.37749966979026794, "rewards/reward_fn/std": 0.9951284527778625, "step": 3527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06381019949913025, "epoch": 0.28224, "grad_norm": 0.0, "learning_rate": 3.0090947627673953e-06, "loss": 0.0, "step": 3528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.53125, "completions/mean_terminated_length": 235.27272033691406, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.06892899423837662, "epoch": 0.28232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0087731801812807e-06, "loss": 0.0, "num_tokens": 159107865.0, "reward": 0.05971250310540199, "reward_std": 0.0, "rewards/reward_fn/mean": 0.05971250310540199, "rewards/reward_fn/std": 0.1586051881313324, "step": 3529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06452898681163788, "epoch": 0.2824, "grad_norm": 0.0, "learning_rate": 3.008451509494339e-06, "loss": 0.0, "step": 3530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.6484375, "completions/mean_terminated_length": 239.296875, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.0665072351694107, "epoch": 0.28248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0081297507290825e-06, "loss": 0.0, "num_tokens": 159205100.0, "reward": 0.03178694099187851, "reward_std": 0.0, "rewards/reward_fn/mean": 0.03178694099187851, "rewards/reward_fn/std": 0.08443079143762589, "step": 3531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06863067299127579, "epoch": 0.28256, "grad_norm": 0.0, "learning_rate": 3.0078079039080327e-06, "loss": 0.0, "step": 3532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 232.640625, "completions/mean_terminated_length": 219.97589111328125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.06613185256719589, "epoch": 0.28264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.007485969053714e-06, "loss": 0.0, "num_tokens": 159300414.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0685345008969307, "epoch": 0.28272, "grad_norm": 0.0, "learning_rate": 3.007163946188659e-06, "loss": 0.0, "step": 3534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 240.625, "completions/mean_terminated_length": 229.760009765625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.07229920104146004, "epoch": 0.2828, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.006841835335407e-06, "loss": 0.0, "num_tokens": 159396750.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07199234515428543, "epoch": 0.28288, "grad_norm": 0.0, "learning_rate": 3.0065196365165e-06, "loss": 0.0, "step": 3536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.640625, "completions/mean_terminated_length": 230.45359802246094, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.06467125937342644, "epoch": 0.28296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.00619734975449e-06, "loss": 0.0, "num_tokens": 159492576.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06558716669678688, "epoch": 0.28304, "grad_norm": 0.0, "learning_rate": 3.005874975071934e-06, "loss": 0.0, "step": 3538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 206.2421875, "completions/mean_terminated_length": 191.6666717529297, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.06400328688323498, "epoch": 0.28312, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.005552512491394e-06, "loss": 0.0, "num_tokens": 159584511.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 3539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06597157195210457, "epoch": 0.2832, "grad_norm": 0.0, "learning_rate": 3.005229962035439e-06, "loss": 0.0, "step": 3540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.3359375, "completions/mean_terminated_length": 217.10293579101562, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.0768517516553402, "epoch": 0.28328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.004907323726645e-06, "loss": 0.0, "num_tokens": 159680170.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0732460618019104, "epoch": 0.28336, "grad_norm": 0.0, "learning_rate": 3.004584597587592e-06, "loss": 0.0, "step": 3542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.8203125, "completions/mean_terminated_length": 207.31068420410156, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.06506237387657166, "epoch": 0.28344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0042617836408682e-06, "loss": 0.0, "num_tokens": 159773459.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06650543957948685, "epoch": 0.28352, "grad_norm": 0.0, "learning_rate": 3.0039388819090676e-06, "loss": 0.0, "step": 3544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.515625, "completions/mean_terminated_length": 225.32652282714844, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.05938169173896313, "epoch": 0.2836, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.003615892414789e-06, "loss": 0.0, "num_tokens": 159868757.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.059898409992456436, "epoch": 0.28368, "grad_norm": 0.0, "learning_rate": 3.003292815180639e-06, "loss": 0.0, "step": 3546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.234375, "completions/mean_terminated_length": 214.1591033935547, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.06804159283638, "epoch": 0.28376, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.002969650229229e-06, "loss": 0.0, "num_tokens": 159963379.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07157646864652634, "epoch": 0.28384, "grad_norm": 0.0, "learning_rate": 3.0026463975831766e-06, "loss": 0.0, "step": 3548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.7734375, "completions/mean_terminated_length": 224.989990234375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.07298031821846962, "epoch": 0.28392, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0023230572651075e-06, "loss": 0.0, "num_tokens": 160058582.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07472351938486099, "epoch": 0.284, "grad_norm": 0.0, "learning_rate": 3.0019996292976517e-06, "loss": 0.0, "step": 3550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 245.1015625, "completions/mean_terminated_length": 241.3157958984375, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "entropy": 0.07598781958222389, "epoch": 0.28408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.001676113703445e-06, "loss": 0.0, "num_tokens": 160155491.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07604587823152542, "epoch": 0.28416, "grad_norm": 0.0, "learning_rate": 3.0013525105051303e-06, "loss": 0.0, "step": 3552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 241.7890625, "completions/mean_terminated_length": 236.01100158691406, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.07023418694734573, "epoch": 0.28424, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0010288197253573e-06, "loss": 0.0, "num_tokens": 160251976.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0673547275364399, "epoch": 0.28432, "grad_norm": 0.0, "learning_rate": 3.0007050413867797e-06, "loss": 0.0, "step": 3554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.7265625, "completions/mean_terminated_length": 214.36666870117188, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.07012517750263214, "epoch": 0.2844, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.0003811755120595e-06, "loss": 0.0, "num_tokens": 160346533.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06982225924730301, "epoch": 0.28448, "grad_norm": 0.0, "learning_rate": 3.0000572221238632e-06, "loss": 0.0, "step": 3556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.5625, "completions/mean_terminated_length": 216.0, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.07320525869727135, "epoch": 0.28456, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9997331812448648e-06, "loss": 0.0, "num_tokens": 160441837.0, "reward": 0.0833282396197319, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0833282396197319, "rewards/reward_fn/std": 0.16935697197914124, "step": 3557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06689980626106262, "epoch": 0.28464, "grad_norm": 0.0, "learning_rate": 2.999409052897743e-06, "loss": 0.0, "step": 3558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.09375, "completions/mean_terminated_length": 235.3023223876953, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.07865544781088829, "epoch": 0.28472, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9990848371051843e-06, "loss": 0.0, "num_tokens": 160538361.0, "reward": 0.0982079803943634, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0982079803943634, "rewards/reward_fn/std": 0.2608548700809479, "step": 3559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08058789372444153, "epoch": 0.2848, "grad_norm": 0.0, "learning_rate": 2.9987605338898795e-06, "loss": 0.0, "step": 3560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.1640625, "completions/mean_terminated_length": 228.82142639160156, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.07946543395519257, "epoch": 0.28488, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.998436143274527e-06, "loss": 0.0, "num_tokens": 160634382.0, "reward": 0.46453723311424255, "reward_std": 0.0, "rewards/reward_fn/mean": 0.46453723311424255, "rewards/reward_fn/std": 0.990456759929657, "step": 3561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0799054242670536, "epoch": 0.28496, "grad_norm": 0.0, "learning_rate": 2.9981116652818303e-06, "loss": 0.0, "step": 3562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.3046875, "completions/mean_terminated_length": 233.183349609375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.06316839903593063, "epoch": 0.28504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9977870999345e-06, "loss": 0.0, "num_tokens": 160731317.0, "reward": 0.4224936366081238, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4224936366081238, "rewards/reward_fn/std": 0.9859711527824402, "step": 3563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0655074305832386, "epoch": 0.28512, "grad_norm": 0.0, "learning_rate": 2.9974624472552508e-06, "loss": 0.0, "step": 3564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 232.9609375, "completions/mean_terminated_length": 205.15516662597656, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.06341061741113663, "epoch": 0.2852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9971377072668066e-06, "loss": 0.0, "num_tokens": 160826672.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.060194261372089386, "epoch": 0.28528, "grad_norm": 0.0, "learning_rate": 2.9968128799918947e-06, "loss": 0.0, "step": 3566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 229.9453125, "completions/mean_terminated_length": 219.35165405273438, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.06916772201657295, "epoch": 0.28536, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9964879654532506e-06, "loss": 0.0, "num_tokens": 160921641.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06767318770289421, "epoch": 0.28544, "grad_norm": 0.0, "learning_rate": 2.9961629636736146e-06, "loss": 0.0, "step": 3568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 213.1953125, "completions/mean_terminated_length": 195.12222290039062, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.06749272346496582, "epoch": 0.28552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9958378746757325e-06, "loss": 0.0, "num_tokens": 161014466.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07025464624166489, "epoch": 0.2856, "grad_norm": 0.0, "learning_rate": 2.995512698482358e-06, "loss": 0.0, "step": 3570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.0234375, "completions/mean_terminated_length": 230.435302734375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.07628165930509567, "epoch": 0.28568, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9951874351162497e-06, "loss": 0.0, "num_tokens": 161110597.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07497604563832283, "epoch": 0.28576, "grad_norm": 0.0, "learning_rate": 2.994862084600172e-06, "loss": 0.0, "step": 3572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.3046875, "completions/mean_terminated_length": 217.8543701171875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.06609369069337845, "epoch": 0.28584, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.994536646956897e-06, "loss": 0.0, "num_tokens": 161204972.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06780217215418816, "epoch": 0.28592, "grad_norm": 0.0, "learning_rate": 2.994211122209202e-06, "loss": 0.0, "step": 3574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.515625, "completions/mean_terminated_length": 205.90000915527344, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.07375194132328033, "epoch": 0.286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9938855103798693e-06, "loss": 0.0, "num_tokens": 161300270.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0697094202041626, "epoch": 0.28608, "grad_norm": 0.0, "learning_rate": 2.9935598114916887e-06, "loss": 0.0, "step": 3576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.84375, "completions/mean_terminated_length": 214.12046813964844, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.07609636709094048, "epoch": 0.28616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9932340255674562e-06, "loss": 0.0, "num_tokens": 161395098.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07503808289766312, "epoch": 0.28624, "grad_norm": 0.0, "learning_rate": 2.9929081526299724e-06, "loss": 0.0, "step": 3578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 235.875, "completions/mean_terminated_length": 219.1999969482422, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.07326911762356758, "epoch": 0.28632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9925821927020456e-06, "loss": 0.0, "num_tokens": 161490826.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07575764507055283, "epoch": 0.2864, "grad_norm": 0.0, "learning_rate": 2.99225614580649e-06, "loss": 0.0, "step": 3580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.265625, "completions/mean_terminated_length": 224.40476989746094, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.05861775763332844, "epoch": 0.28648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9919300119661243e-06, "loss": 0.0, "num_tokens": 161586476.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06271883100271225, "epoch": 0.28656, "grad_norm": 0.0, "learning_rate": 2.9916037912037755e-06, "loss": 0.0, "step": 3582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 231.375, "completions/mean_terminated_length": 216.1012725830078, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.06510190665721893, "epoch": 0.28664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.991277483542275e-06, "loss": 0.0, "num_tokens": 161681628.0, "reward": 0.03411313518881798, "reward_std": 0.0, "rewards/reward_fn/mean": 0.03411313518881798, "rewards/reward_fn/std": 0.09060950577259064, "step": 3583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06559870019555092, "epoch": 0.28672, "grad_norm": 0.0, "learning_rate": 2.990951089004461e-06, "loss": 0.0, "step": 3584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.328125, "completions/mean_terminated_length": 222.42666625976562, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.07232875004410744, "epoch": 0.2868, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9906246076131775e-06, "loss": 0.0, "num_tokens": 161777414.0, "reward": 0.017386555671691895, "reward_std": 0.0, "rewards/reward_fn/mean": 0.017386555671691895, "rewards/reward_fn/std": 0.04618125036358833, "step": 3585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07289931550621986, "epoch": 0.28688, "grad_norm": 0.0, "learning_rate": 2.9902980393912748e-06, "loss": 0.0, "step": 3586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.078125, "completions/mean_terminated_length": 214.48191833496094, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.08071227744221687, "epoch": 0.28696, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.98997138436161e-06, "loss": 0.0, "num_tokens": 161872272.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08027135580778122, "epoch": 0.28704, "grad_norm": 0.0, "learning_rate": 2.9896446425470445e-06, "loss": 0.0, "step": 3588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 220.46875, "completions/mean_terminated_length": 194.54054260253906, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.061769528314471245, "epoch": 0.28712, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.989317813970447e-06, "loss": 0.0, "num_tokens": 161966028.0, "reward": 0.09073717892169952, "reward_std": 0.0, "rewards/reward_fn/mean": 0.09073717892169952, "rewards/reward_fn/std": 0.24101130664348602, "step": 3589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.064136803150177, "epoch": 0.2872, "grad_norm": 0.0, "learning_rate": 2.9889908986546924e-06, "loss": 0.0, "step": 3590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.28125, "completions/mean_terminated_length": 231.759033203125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.07090167701244354, "epoch": 0.28728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.988663896622661e-06, "loss": 0.0, "num_tokens": 162062320.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0680454857647419, "epoch": 0.28736, "grad_norm": 0.0, "learning_rate": 2.988336807897239e-06, "loss": 0.0, "step": 3592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 226.3359375, "completions/mean_terminated_length": 219.13592529296875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.07045191526412964, "epoch": 0.28744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.988009632501321e-06, "loss": 0.0, "num_tokens": 162156827.0, "reward": 0.09931214898824692, "reward_std": 0.0, "rewards/reward_fn/mean": 0.09931214898824692, "rewards/reward_fn/std": 0.22196801006793976, "step": 3593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07053368166089058, "epoch": 0.28752, "grad_norm": 0.0, "learning_rate": 2.987682370457804e-06, "loss": 0.0, "step": 3594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.3828125, "completions/mean_terminated_length": 226.86300659179688, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.05929850786924362, "epoch": 0.2876, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9873550217895937e-06, "loss": 0.0, "num_tokens": 162253004.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06208653934299946, "epoch": 0.28768, "grad_norm": 0.0, "learning_rate": 2.9870275865196004e-06, "loss": 0.0, "step": 3596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.1484375, "completions/mean_terminated_length": 205.99020385742188, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.07447874546051025, "epoch": 0.28776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9867000646707416e-06, "loss": 0.0, "num_tokens": 162346207.0, "reward": 0.8461265563964844, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8461265563964844, "rewards/reward_fn/std": 1.259202003479004, "step": 3597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07710399106144905, "epoch": 0.28784, "grad_norm": 0.0, "learning_rate": 2.9863724562659403e-06, "loss": 0.0, "step": 3598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.9140625, "completions/mean_terminated_length": 225.53607177734375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.08048231527209282, "epoch": 0.28792, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9860447613281257e-06, "loss": 0.0, "num_tokens": 162441556.0, "reward": 0.4620321989059448, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4620321989059448, "rewards/reward_fn/std": 0.989837110042572, "step": 3599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08025044575333595, "epoch": 0.288, "grad_norm": 0.0, "learning_rate": 2.9857169798802328e-06, "loss": 0.0, "step": 3600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.375, "completions/mean_terminated_length": 206.60870361328125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.07049962505698204, "epoch": 0.28808, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.985389111945203e-06, "loss": 0.0, "num_tokens": 162536452.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06700219213962555, "epoch": 0.28816, "grad_norm": 0.0, "learning_rate": 2.9850611575459835e-06, "loss": 0.0, "step": 3602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.0390625, "completions/mean_terminated_length": 236.37179565429688, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.06755024939775467, "epoch": 0.28824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9847331167055283e-06, "loss": 0.0, "num_tokens": 162633225.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06332114338874817, "epoch": 0.28832, "grad_norm": 0.0, "learning_rate": 2.9844049894467954e-06, "loss": 0.0, "step": 3604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.0546875, "completions/mean_terminated_length": 207.38043212890625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.06828152015805244, "epoch": 0.2884, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.984076775792751e-06, "loss": 0.0, "num_tokens": 162727056.0, "reward": 0.415934681892395, "reward_std": 0.0, "rewards/reward_fn/mean": 0.415934681892395, "rewards/reward_fn/std": 0.9864103198051453, "step": 3605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06559451669454575, "epoch": 0.28848, "grad_norm": 0.0, "learning_rate": 2.9837484757663667e-06, "loss": 0.0, "step": 3606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.109375, "completions/mean_terminated_length": 225.54348754882812, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.07124429196119308, "epoch": 0.28856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9834200893906203e-06, "loss": 0.0, "num_tokens": 162822558.0, "reward": 0.38249102234840393, "reward_std": 0.0, "rewards/reward_fn/mean": 0.38249102234840393, "rewards/reward_fn/std": 0.9934079051017761, "step": 3607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0684141218662262, "epoch": 0.28864, "grad_norm": 0.0, "learning_rate": 2.983091616688495e-06, "loss": 0.0, "step": 3608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 218.9453125, "completions/mean_terminated_length": 201.48275756835938, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.06843958795070648, "epoch": 0.28872, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9827630576829802e-06, "loss": 0.0, "num_tokens": 162916119.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06795565783977509, "epoch": 0.2888, "grad_norm": 0.0, "learning_rate": 2.9824344123970716e-06, "loss": 0.0, "step": 3610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.890625, "completions/mean_terminated_length": 239.80555725097656, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.06521469727158546, "epoch": 0.28888, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.982105680853772e-06, "loss": 0.0, "num_tokens": 163013257.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06502645462751389, "epoch": 0.28896, "grad_norm": 0.0, "learning_rate": 2.9817768630760873e-06, "loss": 0.0, "step": 3612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 229.6328125, "completions/mean_terminated_length": 213.8125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.06543747335672379, "epoch": 0.28904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.981447959087032e-06, "loss": 0.0, "num_tokens": 163108186.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06735740974545479, "epoch": 0.28912, "grad_norm": 0.0, "learning_rate": 2.981118968909627e-06, "loss": 0.0, "step": 3614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.796875, "completions/mean_terminated_length": 237.25001525878906, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.07068191841244698, "epoch": 0.2892, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9807898925668972e-06, "loss": 0.0, "num_tokens": 163205440.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07106533274054527, "epoch": 0.28928, "grad_norm": 0.0, "learning_rate": 2.9804607300818735e-06, "loss": 0.0, "step": 3616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.515625, "completions/mean_terminated_length": 213.6764678955078, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.07231392711400986, "epoch": 0.28936, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.980131481477596e-06, "loss": 0.0, "num_tokens": 163300866.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07320123165845871, "epoch": 0.28944, "grad_norm": 0.0, "learning_rate": 2.9798021467771068e-06, "loss": 0.0, "step": 3618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5546875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.4453125, "completions/mean_terminated_length": 209.84210205078125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.07454754412174225, "epoch": 0.28952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9794727260034567e-06, "loss": 0.0, "num_tokens": 163396539.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07464433461427689, "epoch": 0.2896, "grad_norm": 0.0, "learning_rate": 2.979143219179701e-06, "loss": 0.0, "step": 3620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 239.1015625, "completions/mean_terminated_length": 231.13792419433594, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.06846712529659271, "epoch": 0.28968, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9788136263289027e-06, "loss": 0.0, "num_tokens": 163492680.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07278136163949966, "epoch": 0.28976, "grad_norm": 0.0, "learning_rate": 2.9784839474741295e-06, "loss": 0.0, "step": 3622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.5, "completions/mean_terminated_length": 198.58824157714844, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.06718125939369202, "epoch": 0.28984, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.978154182638455e-06, "loss": 0.0, "num_tokens": 163587080.0, "reward": 0.03411313518881798, "reward_std": 0.0, "rewards/reward_fn/mean": 0.03411313518881798, "rewards/reward_fn/std": 0.09060950577259064, "step": 3623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06690464168787003, "epoch": 0.28992, "grad_norm": 0.0, "learning_rate": 2.9778243318449594e-06, "loss": 0.0, "step": 3624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.890625, "completions/mean_terminated_length": 221.80850219726562, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.07545147463679314, "epoch": 0.29, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9774943951167287e-06, "loss": 0.0, "num_tokens": 163682170.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07252154499292374, "epoch": 0.29008, "grad_norm": 0.0, "learning_rate": 2.977164372476855e-06, "loss": 0.0, "step": 3626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 220.5625, "completions/mean_terminated_length": 212.38462829589844, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.07270438596606255, "epoch": 0.29016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9768342639484367e-06, "loss": 0.0, "num_tokens": 163775938.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 3627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07104694470763206, "epoch": 0.29024, "grad_norm": 0.0, "learning_rate": 2.976504069554578e-06, "loss": 0.0, "step": 3628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 217.3203125, "completions/mean_terminated_length": 192.52565002441406, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.06935213506221771, "epoch": 0.29032, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9761737893183888e-06, "loss": 0.0, "num_tokens": 163869291.0, "reward": 0.4438909888267517, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4438909888267517, "rewards/reward_fn/std": 0.9866783618927002, "step": 3629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07797552645206451, "epoch": 0.2904, "grad_norm": 0.0, "learning_rate": 2.9758434232629847e-06, "loss": 0.0, "step": 3630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 206.90625, "completions/mean_terminated_length": 197.8148193359375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.0632760263979435, "epoch": 0.29048, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.975512971411489e-06, "loss": 0.0, "num_tokens": 163961311.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06334017217159271, "epoch": 0.29056, "grad_norm": 0.0, "learning_rate": 2.9751824337870283e-06, "loss": 0.0, "step": 3632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.453125, "completions/mean_terminated_length": 199.75555419921875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.06123049929738045, "epoch": 0.29064, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.974851810412738e-06, "loss": 0.0, "num_tokens": 164054553.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06344396248459816, "epoch": 0.29072, "grad_norm": 0.0, "learning_rate": 2.9745211013117575e-06, "loss": 0.0, "step": 3634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 224.0546875, "completions/mean_terminated_length": 208.4534912109375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.06636881083250046, "epoch": 0.2908, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.974190306507234e-06, "loss": 0.0, "num_tokens": 164148768.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06690134853124619, "epoch": 0.29088, "grad_norm": 0.0, "learning_rate": 2.9738594260223185e-06, "loss": 0.0, "step": 3636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.2578125, "completions/mean_terminated_length": 226.50650024414062, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.07346218824386597, "epoch": 0.29096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9735284598801697e-06, "loss": 0.0, "num_tokens": 164244801.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07170365750789642, "epoch": 0.29104, "grad_norm": 0.0, "learning_rate": 2.973197408103952e-06, "loss": 0.0, "step": 3638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.71875, "completions/mean_terminated_length": 222.63917541503906, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.07133907452225685, "epoch": 0.29112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.972866270716835e-06, "loss": 0.0, "num_tokens": 164339869.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0736643448472023, "epoch": 0.2912, "grad_norm": 0.0, "learning_rate": 2.972535047741994e-06, "loss": 0.0, "step": 3640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 220.8203125, "completions/mean_terminated_length": 205.9666748046875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06183156557381153, "epoch": 0.29128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.972203739202613e-06, "loss": 0.0, "num_tokens": 164433670.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06488382816314697, "epoch": 0.29136, "grad_norm": 0.0, "learning_rate": 2.971872345121879e-06, "loss": 0.0, "step": 3642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.3515625, "completions/mean_terminated_length": 213.36363220214844, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.06319411844015121, "epoch": 0.29144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9715408655229863e-06, "loss": 0.0, "num_tokens": 164528691.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06631550937891006, "epoch": 0.29152, "grad_norm": 0.0, "learning_rate": 2.9712093004291347e-06, "loss": 0.0, "step": 3644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.203125, "completions/mean_terminated_length": 222.5894775390625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.06363625451922417, "epoch": 0.2916, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9708776498635305e-06, "loss": 0.0, "num_tokens": 164623821.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06434430927038193, "epoch": 0.29168, "grad_norm": 0.0, "learning_rate": 2.970545913849386e-06, "loss": 0.0, "step": 3646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 236.296875, "completions/mean_terminated_length": 233.07272338867188, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.06168545410037041, "epoch": 0.29176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9702140924099185e-06, "loss": 0.0, "num_tokens": 164719603.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06222455948591232, "epoch": 0.29184, "grad_norm": 0.0, "learning_rate": 2.969882185568353e-06, "loss": 0.0, "step": 3648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.8359375, "completions/mean_terminated_length": 238.1374969482422, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.06369336694478989, "epoch": 0.29192, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9695501933479188e-06, "loss": 0.0, "num_tokens": 164816478.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.061149295419454575, "epoch": 0.292, "grad_norm": 0.0, "learning_rate": 2.9692181157718516e-06, "loss": 0.0, "step": 3650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.3046875, "completions/mean_terminated_length": 210.31944274902344, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.06646094843745232, "epoch": 0.29208, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9688859528633937e-06, "loss": 0.0, "num_tokens": 164911493.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06571527570486069, "epoch": 0.29216, "grad_norm": 0.0, "learning_rate": 2.968553704645793e-06, "loss": 0.0, "step": 3652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.515625, "completions/mean_terminated_length": 224.02040100097656, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.06560425460338593, "epoch": 0.29224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9682213711423036e-06, "loss": 0.0, "num_tokens": 165006663.0, "reward": 0.45364314317703247, "reward_std": 0.0, "rewards/reward_fn/mean": 0.45364314317703247, "rewards/reward_fn/std": 0.9880856871604919, "step": 3653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06647749617695808, "epoch": 0.29232, "grad_norm": 0.0, "learning_rate": 2.967888952376185e-06, "loss": 0.0, "step": 3654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 242.796875, "completions/mean_terminated_length": 236.11764526367188, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.07358233630657196, "epoch": 0.2924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9675564483707024e-06, "loss": 0.0, "num_tokens": 165103277.0, "reward": 0.017386555671691895, "reward_std": 0.0, "rewards/reward_fn/mean": 0.017386555671691895, "rewards/reward_fn/std": 0.04618125036358833, "step": 3655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07032237946987152, "epoch": 0.29248, "grad_norm": 0.0, "learning_rate": 2.967223859149129e-06, "loss": 0.0, "step": 3656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 192.8125, "completions/mean_terminated_length": 181.79815673828125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.06085215508937836, "epoch": 0.29256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9668911847347415e-06, "loss": 0.0, "num_tokens": 165193493.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 3657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06443792022764683, "epoch": 0.29264, "grad_norm": 0.0, "learning_rate": 2.9665584251508243e-06, "loss": 0.0, "step": 3658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 213.5390625, "completions/mean_terminated_length": 195.61111450195312, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.06960014253854752, "epoch": 0.29272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9662255804206663e-06, "loss": 0.0, "num_tokens": 165286362.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07582449540495872, "epoch": 0.2928, "grad_norm": 0.0, "learning_rate": 2.9658926505675634e-06, "loss": 0.0, "step": 3660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.5859375, "completions/mean_terminated_length": 241.8235321044922, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.07370822504162788, "epoch": 0.29288, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.965559635614817e-06, "loss": 0.0, "num_tokens": 165383461.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07405759394168854, "epoch": 0.29296, "grad_norm": 0.0, "learning_rate": 2.9652265355857354e-06, "loss": 0.0, "step": 3662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1328125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 209.78125, "completions/mean_terminated_length": 202.7027130126953, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.06992083415389061, "epoch": 0.29304, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.964893350503631e-06, "loss": 0.0, "num_tokens": 165475849.0, "reward": 0.38249102234840393, "reward_std": 0.0, "rewards/reward_fn/mean": 0.38249102234840393, "rewards/reward_fn/std": 0.9934079051017761, "step": 3663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07088755816221237, "epoch": 0.29312, "grad_norm": 0.0, "learning_rate": 2.9645600803918235e-06, "loss": 0.0, "step": 3664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 195.546875, "completions/mean_terminated_length": 183.0, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.059101587161421776, "epoch": 0.2932, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9642267252736393e-06, "loss": 0.0, "num_tokens": 165566415.0, "reward": 0.8077646493911743, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8077646493911743, "rewards/reward_fn/std": 1.2794528007507324, "step": 3665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05890479311347008, "epoch": 0.29328, "grad_norm": 0.0, "learning_rate": 2.9638932851724087e-06, "loss": 0.0, "step": 3666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.59375, "completions/mean_terminated_length": 225.48934936523438, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.0660485029220581, "epoch": 0.29336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.963559760111469e-06, "loss": 0.0, "num_tokens": 165661851.0, "reward": 0.012458499521017075, "reward_std": 0.0, "rewards/reward_fn/mean": 0.012458499521017075, "rewards/reward_fn/std": 0.03309160843491554, "step": 3667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06703602522611618, "epoch": 0.29344, "grad_norm": 0.0, "learning_rate": 2.9632261501141644e-06, "loss": 0.0, "step": 3668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.9921875, "completions/mean_terminated_length": 235.11956787109375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.06977498531341553, "epoch": 0.29352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.962892455203843e-06, "loss": 0.0, "num_tokens": 165758234.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0702858604490757, "epoch": 0.2936, "grad_norm": 0.0, "learning_rate": 2.96255867540386e-06, "loss": 0.0, "step": 3670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 222.515625, "completions/mean_terminated_length": 213.13999938964844, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.06075158715248108, "epoch": 0.29368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9622248107375774e-06, "loss": 0.0, "num_tokens": 165852252.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05890806019306183, "epoch": 0.29376, "grad_norm": 0.0, "learning_rate": 2.9618908612283607e-06, "loss": 0.0, "step": 3672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 236.578125, "completions/mean_terminated_length": 223.7142791748047, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.07392643019556999, "epoch": 0.29384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9615568268995846e-06, "loss": 0.0, "num_tokens": 165948070.0, "reward": 0.415934681892395, "reward_std": 0.0, "rewards/reward_fn/mean": 0.415934681892395, "rewards/reward_fn/std": 0.9864102602005005, "step": 3673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0691768154501915, "epoch": 0.29392, "grad_norm": 0.0, "learning_rate": 2.9612227077746265e-06, "loss": 0.0, "step": 3674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 212.7890625, "completions/mean_terminated_length": 205.71817016601562, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.06780123710632324, "epoch": 0.294, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9608885038768716e-06, "loss": 0.0, "num_tokens": 166040843.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06824066117405891, "epoch": 0.29408, "grad_norm": 0.0, "learning_rate": 2.9605542152297112e-06, "loss": 0.0, "step": 3676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.7421875, "completions/mean_terminated_length": 233.4591827392578, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.06537842005491257, "epoch": 0.29416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9602198418565417e-06, "loss": 0.0, "num_tokens": 166136938.0, "reward": 0.0993937999010086, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0993937999010086, "rewards/reward_fn/std": 0.24206724762916565, "step": 3677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06758490204811096, "epoch": 0.29424, "grad_norm": 0.0, "learning_rate": 2.959885383780765e-06, "loss": 0.0, "step": 3678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 236.515625, "completions/mean_terminated_length": 225.58535766601562, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.06862872838973999, "epoch": 0.29432, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.959550841025791e-06, "loss": 0.0, "num_tokens": 166232748.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06893529742956161, "epoch": 0.2944, "grad_norm": 0.0, "learning_rate": 2.9592162136150334e-06, "loss": 0.0, "step": 3680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 228.6015625, "completions/mean_terminated_length": 216.14773559570312, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.05997759476304054, "epoch": 0.29448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.958881501571912e-06, "loss": 0.0, "num_tokens": 166327545.0, "reward": 0.8678128719329834, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8678128719329834, "rewards/reward_fn/std": 1.2730282545089722, "step": 3681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06280234456062317, "epoch": 0.29456, "grad_norm": 0.0, "learning_rate": 2.9585467049198538e-06, "loss": 0.0, "step": 3682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 231.3203125, "completions/mean_terminated_length": 220.90000915527344, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.06284244917333126, "epoch": 0.29464, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9582118236822915e-06, "loss": 0.0, "num_tokens": 166422690.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06628677621483803, "epoch": 0.29472, "grad_norm": 0.0, "learning_rate": 2.9578768578826624e-06, "loss": 0.0, "step": 3684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.6953125, "completions/mean_terminated_length": 229.1643829345703, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.06662895530462265, "epoch": 0.2948, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9575418075444103e-06, "loss": 0.0, "num_tokens": 166519035.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06785360351204872, "epoch": 0.29488, "grad_norm": 0.0, "learning_rate": 2.957206672690986e-06, "loss": 0.0, "step": 3686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 200.671875, "completions/mean_terminated_length": 190.42593383789062, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.06630145758390427, "epoch": 0.29496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9568714533458454e-06, "loss": 0.0, "num_tokens": 166610257.0, "reward": 0.4020647704601288, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4020647704601288, "rewards/reward_fn/std": 0.9883498549461365, "step": 3687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06935492157936096, "epoch": 0.29504, "grad_norm": 0.0, "learning_rate": 2.95653614953245e-06, "loss": 0.0, "step": 3688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1640625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.953125, "completions/mean_terminated_length": 221.25233459472656, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.07558005303144455, "epoch": 0.29512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.956200761274268e-06, "loss": 0.0, "num_tokens": 166704843.0, "reward": 0.002499666763469577, "reward_std": 0.0, "rewards/reward_fn/mean": 0.002499666763469577, "rewards/reward_fn/std": 0.006639483384788036, "step": 3689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07389827445149422, "epoch": 0.2952, "grad_norm": 0.0, "learning_rate": 2.9558652885947715e-06, "loss": 0.0, "step": 3690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 244.1953125, "completions/mean_terminated_length": 233.10606384277344, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.07101614773273468, "epoch": 0.29528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.955529731517442e-06, "loss": 0.0, "num_tokens": 166801636.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06917402893304825, "epoch": 0.29536, "grad_norm": 0.0, "learning_rate": 2.9551940900657637e-06, "loss": 0.0, "step": 3692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 232.40625, "completions/mean_terminated_length": 203.9310302734375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06276893243193626, "epoch": 0.29544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9548583642632282e-06, "loss": 0.0, "num_tokens": 166896920.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0646829828619957, "epoch": 0.29552, "grad_norm": 0.0, "learning_rate": 2.9545225541333337e-06, "loss": 0.0, "step": 3694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.125, "completions/mean_terminated_length": 239.47540283203125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.06869996339082718, "epoch": 0.2956, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9541866596995815e-06, "loss": 0.0, "num_tokens": 166994216.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06875809654593468, "epoch": 0.29568, "grad_norm": 0.0, "learning_rate": 2.9538506809854826e-06, "loss": 0.0, "step": 3696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 218.46875, "completions/mean_terminated_length": 200.13954162597656, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.0710008516907692, "epoch": 0.29576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9535146180145504e-06, "loss": 0.0, "num_tokens": 167087716.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06569424644112587, "epoch": 0.29584, "grad_norm": 0.0, "learning_rate": 2.953178470810307e-06, "loss": 0.0, "step": 3698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.5234375, "completions/mean_terminated_length": 228.34327697753906, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.06487605720758438, "epoch": 0.29592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9528422393962784e-06, "loss": 0.0, "num_tokens": 167184167.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0619845949113369, "epoch": 0.296, "grad_norm": 0.0, "learning_rate": 2.9525059237959976e-06, "loss": 0.0, "step": 3700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6796875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.6171875, "completions/mean_terminated_length": 201.73170471191406, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.0644795298576355, "epoch": 0.29608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.952169524033003e-06, "loss": 0.0, "num_tokens": 167280246.0, "reward": 0.45377030968666077, "reward_std": 0.0, "rewards/reward_fn/mean": 0.45377030968666077, "rewards/reward_fn/std": 0.9766225814819336, "step": 3701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06442930549383163, "epoch": 0.29616, "grad_norm": 0.0, "learning_rate": 2.951833040130839e-06, "loss": 0.0, "step": 3702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.1875, "completions/mean_terminated_length": 208.3000030517578, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.08152960613369942, "epoch": 0.29624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9514964721130555e-06, "loss": 0.0, "num_tokens": 167374734.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07592878490686417, "epoch": 0.29632, "grad_norm": 0.0, "learning_rate": 2.9511598200032096e-06, "loss": 0.0, "step": 3704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.84375, "completions/mean_terminated_length": 216.24691772460938, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.06838618963956833, "epoch": 0.2964, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.950823083824863e-06, "loss": 0.0, "num_tokens": 167469818.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06926407292485237, "epoch": 0.29648, "grad_norm": 0.0, "learning_rate": 2.9504862636015833e-06, "loss": 0.0, "step": 3706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 226.5859375, "completions/mean_terminated_length": 215.94680786132812, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.07000597938895226, "epoch": 0.29656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9501493593569447e-06, "loss": 0.0, "num_tokens": 167564357.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0719817541539669, "epoch": 0.29664, "grad_norm": 0.0, "learning_rate": 2.9498123711145275e-06, "loss": 0.0, "step": 3708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 225.0625, "completions/mean_terminated_length": 214.3157958984375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.07059577107429504, "epoch": 0.29672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9494752988979158e-06, "loss": 0.0, "num_tokens": 167658701.0, "reward": 0.02467191591858864, "reward_std": 0.0, "rewards/reward_fn/mean": 0.02467191591858864, "rewards/reward_fn/std": 0.06553223729133606, "step": 3709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06756538152694702, "epoch": 0.2968, "grad_norm": 0.0, "learning_rate": 2.949138142730703e-06, "loss": 0.0, "step": 3710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.1484375, "completions/mean_terminated_length": 205.99020385742188, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.07285065203905106, "epoch": 0.29688, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.948800902636485e-06, "loss": 0.0, "num_tokens": 167751904.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 3711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07294640317559242, "epoch": 0.29696, "grad_norm": 0.0, "learning_rate": 2.9484635786388653e-06, "loss": 0.0, "step": 3712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 223.4140625, "completions/mean_terminated_length": 219.4122772216797, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.06416979245841503, "epoch": 0.29704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.948126170761454e-06, "loss": 0.0, "num_tokens": 167846037.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06423895806074142, "epoch": 0.29712, "grad_norm": 0.0, "learning_rate": 2.9477886790278645e-06, "loss": 0.0, "step": 3714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 205.671875, "completions/mean_terminated_length": 190.92929077148438, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.07211339473724365, "epoch": 0.2972, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.947451103461719e-06, "loss": 0.0, "num_tokens": 167937899.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07937818020582199, "epoch": 0.29728, "grad_norm": 0.0, "learning_rate": 2.947113444086644e-06, "loss": 0.0, "step": 3716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5703125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.1875, "completions/mean_terminated_length": 235.49090576171875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.0664697103202343, "epoch": 0.29736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.946775700926271e-06, "loss": 0.0, "num_tokens": 168035075.0, "reward": 0.06713119894266129, "reward_std": 0.0, "rewards/reward_fn/mean": 0.06713119894266129, "rewards/reward_fn/std": 0.1783103495836258, "step": 3717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06361545622348785, "epoch": 0.29744, "grad_norm": 0.0, "learning_rate": 2.9464378740042396e-06, "loss": 0.0, "step": 3718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.5703125, "completions/mean_terminated_length": 237.39654541015625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.06369857490062714, "epoch": 0.29752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9460999633441944e-06, "loss": 0.0, "num_tokens": 168132300.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06615232676267624, "epoch": 0.2976, "grad_norm": 0.0, "learning_rate": 2.945761968969784e-06, "loss": 0.0, "step": 3720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 233.28125, "completions/mean_terminated_length": 211.9394073486328, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.07153968513011932, "epoch": 0.29768, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9454238909046658e-06, "loss": 0.0, "num_tokens": 168227696.0, "reward": 0.4224936366081238, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4224936366081238, "rewards/reward_fn/std": 0.9859711527824402, "step": 3721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06822090595960617, "epoch": 0.29776, "grad_norm": 0.0, "learning_rate": 2.9450857291725014e-06, "loss": 0.0, "step": 3722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.890625, "completions/mean_terminated_length": 235.6857147216797, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.06434464640915394, "epoch": 0.29784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.944747483796959e-06, "loss": 0.0, "num_tokens": 168324578.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06576817482709885, "epoch": 0.29792, "grad_norm": 0.0, "learning_rate": 2.944409154801711e-06, "loss": 0.0, "step": 3724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 241.9453125, "completions/mean_terminated_length": 226.0166778564453, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.07491834834218025, "epoch": 0.298, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9440707422104376e-06, "loss": 0.0, "num_tokens": 168421083.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07968207448720932, "epoch": 0.29808, "grad_norm": 0.0, "learning_rate": 2.943732246046824e-06, "loss": 0.0, "step": 3726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.5546875, "completions/mean_terminated_length": 208.9560546875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.08217087015509605, "epoch": 0.29816, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9433936663345612e-06, "loss": 0.0, "num_tokens": 168515106.0, "reward": 1.1349787712097168, "reward_std": 0.0, "rewards/reward_fn/mean": 1.1349787712097168, "rewards/reward_fn/std": 1.450537085533142, "step": 3727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08368504792451859, "epoch": 0.29824, "grad_norm": 0.0, "learning_rate": 2.9430550030973467e-06, "loss": 0.0, "step": 3728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.84375, "completions/mean_terminated_length": 234.41026306152344, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.06499318405985832, "epoch": 0.29832, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9427162563588827e-06, "loss": 0.0, "num_tokens": 168611726.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06738973781466484, "epoch": 0.2984, "grad_norm": 0.0, "learning_rate": 2.942377426142879e-06, "loss": 0.0, "step": 3730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.890625, "completions/mean_terminated_length": 225.03797912597656, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.067019272595644, "epoch": 0.29848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9420385124730483e-06, "loss": 0.0, "num_tokens": 168707584.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07091080397367477, "epoch": 0.29856, "grad_norm": 0.0, "learning_rate": 2.9416995153731127e-06, "loss": 0.0, "step": 3732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 239.2734375, "completions/mean_terminated_length": 234.80197143554688, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.07321805879473686, "epoch": 0.29864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9413604348667974e-06, "loss": 0.0, "num_tokens": 168803747.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07435904070734978, "epoch": 0.29872, "grad_norm": 0.0, "learning_rate": 2.941021270977835e-06, "loss": 0.0, "step": 3734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.84375, "completions/mean_terminated_length": 222.56790161132812, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.07389359921216965, "epoch": 0.2988, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.940682023729963e-06, "loss": 0.0, "num_tokens": 168899343.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07618671655654907, "epoch": 0.29888, "grad_norm": 0.0, "learning_rate": 2.9403426931469257e-06, "loss": 0.0, "step": 3736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 231.3828125, "completions/mean_terminated_length": 211.61972045898438, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.07565126568078995, "epoch": 0.29896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.940003279252472e-06, "loss": 0.0, "num_tokens": 168994496.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07584676891565323, "epoch": 0.29904, "grad_norm": 0.0, "learning_rate": 2.939663782070358e-06, "loss": 0.0, "step": 3738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.1171875, "completions/mean_terminated_length": 207.0, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.06365224719047546, "epoch": 0.29912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.939324201624344e-06, "loss": 0.0, "num_tokens": 169089615.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06422067433595657, "epoch": 0.2992, "grad_norm": 0.0, "learning_rate": 2.9389845379381973e-06, "loss": 0.0, "step": 3740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 208.28125, "completions/mean_terminated_length": 188.87911987304688, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.06459594517946243, "epoch": 0.29928, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.938644791035691e-06, "loss": 0.0, "num_tokens": 169181811.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 3741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06215308792889118, "epoch": 0.29936, "grad_norm": 0.0, "learning_rate": 2.9383049609406043e-06, "loss": 0.0, "step": 3742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1640625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 205.203125, "completions/mean_terminated_length": 195.233642578125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.08115194737911224, "epoch": 0.29944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9379650476767213e-06, "loss": 0.0, "num_tokens": 169273613.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08341438695788383, "epoch": 0.29952, "grad_norm": 0.0, "learning_rate": 2.9376250512678318e-06, "loss": 0.0, "step": 3744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.078125, "completions/mean_terminated_length": 209.13925170898438, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.06722818315029144, "epoch": 0.2996, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9372849717377323e-06, "loss": 0.0, "num_tokens": 169368215.0, "reward": 0.3923865556716919, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3923865556716919, "rewards/reward_fn/std": 0.9905130863189697, "step": 3745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06358270347118378, "epoch": 0.29968, "grad_norm": 0.0, "learning_rate": 2.9369448091102253e-06, "loss": 0.0, "step": 3746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.25, "completions/mean_terminated_length": 224.83116149902344, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.07011310383677483, "epoch": 0.29976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9366045634091183e-06, "loss": 0.0, "num_tokens": 169464119.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07121771201491356, "epoch": 0.29984, "grad_norm": 0.0, "learning_rate": 2.936264234658224e-06, "loss": 0.0, "step": 3748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 197.8984375, "completions/mean_terminated_length": 183.79611206054688, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.07217619568109512, "epoch": 0.29992, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.935923822881363e-06, "loss": 0.0, "num_tokens": 169554986.0, "reward": 0.43495213985443115, "reward_std": 0.0, "rewards/reward_fn/mean": 0.43495213985443115, "rewards/reward_fn/std": 0.9811339974403381, "step": 3749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07382635027170181, "epoch": 0.3, "grad_norm": 0.0, "learning_rate": 2.9355833281023605e-06, "loss": 0.0, "step": 3750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 221.421875, "completions/mean_terminated_length": 199.25640869140625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.07105176150798798, "epoch": 0.30008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.935242750345047e-06, "loss": 0.0, "num_tokens": 169648864.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0684434026479721, "epoch": 0.30016, "grad_norm": 0.0, "learning_rate": 2.934902089633259e-06, "loss": 0.0, "step": 3752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.7890625, "completions/mean_terminated_length": 231.9629669189453, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.06731565296649933, "epoch": 0.30024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9345613459908398e-06, "loss": 0.0, "num_tokens": 169745221.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07011070847511292, "epoch": 0.30032, "grad_norm": 0.0, "learning_rate": 2.934220519441638e-06, "loss": 0.0, "step": 3754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.4375, "completions/mean_terminated_length": 205.9178009033203, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.07595058158040047, "epoch": 0.3004, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.933879610009508e-06, "loss": 0.0, "num_tokens": 169839869.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07170072197914124, "epoch": 0.30048, "grad_norm": 0.0, "learning_rate": 2.9335386177183087e-06, "loss": 0.0, "step": 3756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.765625, "completions/mean_terminated_length": 211.28570556640625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06949944794178009, "epoch": 0.30056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.933197542591907e-06, "loss": 0.0, "num_tokens": 169933791.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07430214062333107, "epoch": 0.30064, "grad_norm": 0.0, "learning_rate": 2.932856384654174e-06, "loss": 0.0, "step": 3758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 212.7265625, "completions/mean_terminated_length": 202.74038696289062, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.0679311528801918, "epoch": 0.30072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.932515143928988e-06, "loss": 0.0, "num_tokens": 170026556.0, "reward": 0.3874585032463074, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3874585032463074, "rewards/reward_fn/std": 0.9918686747550964, "step": 3759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0676242932677269, "epoch": 0.3008, "grad_norm": 0.0, "learning_rate": 2.932173820440231e-06, "loss": 0.0, "step": 3760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.21875, "completions/mean_terminated_length": 209.5211181640625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.06629042327404022, "epoch": 0.30088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9318324142117926e-06, "loss": 0.0, "num_tokens": 170121560.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06505771912634373, "epoch": 0.30096, "grad_norm": 0.0, "learning_rate": 2.9314909252675678e-06, "loss": 0.0, "step": 3762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.765625, "completions/mean_terminated_length": 238.9870147705078, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.07228207588195801, "epoch": 0.30104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.931149353631458e-06, "loss": 0.0, "num_tokens": 170218554.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07302269339561462, "epoch": 0.30112, "grad_norm": 0.0, "learning_rate": 2.930807699327368e-06, "loss": 0.0, "step": 3764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.765625, "completions/mean_terminated_length": 196.9176483154297, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.0672704428434372, "epoch": 0.3012, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.930465962379211e-06, "loss": 0.0, "num_tokens": 170311836.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06667788326740265, "epoch": 0.30128, "grad_norm": 0.0, "learning_rate": 2.9301241428109043e-06, "loss": 0.0, "step": 3766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.78125, "completions/mean_terminated_length": 225.80821228027344, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.06929346919059753, "epoch": 0.30136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.929782240646372e-06, "loss": 0.0, "num_tokens": 170407936.0, "reward": 0.10670649260282516, "reward_std": 0.0, "rewards/reward_fn/mean": 0.10670649260282516, "rewards/reward_fn/std": 0.2759184241294861, "step": 3767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06828169524669647, "epoch": 0.30144, "grad_norm": 0.0, "learning_rate": 2.929440255909544e-06, "loss": 0.0, "step": 3768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.1484375, "completions/mean_terminated_length": 217.52325439453125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.06960970908403397, "epoch": 0.30152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9290981886243554e-06, "loss": 0.0, "num_tokens": 170502931.0, "reward": 0.4759461283683777, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4759461283683777, "rewards/reward_fn/std": 0.9938373565673828, "step": 3769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06805434450507164, "epoch": 0.3016, "grad_norm": 0.0, "learning_rate": 2.9287560388147472e-06, "loss": 0.0, "step": 3770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.0546875, "completions/mean_terminated_length": 241.12986755371094, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.07146518304944038, "epoch": 0.30168, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9284138065046666e-06, "loss": 0.0, "num_tokens": 170600090.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07306281477212906, "epoch": 0.30176, "grad_norm": 0.0, "learning_rate": 2.928071491718065e-06, "loss": 0.0, "step": 3772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.1640625, "completions/mean_terminated_length": 217.5465087890625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.07398320361971855, "epoch": 0.30184, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.927729094478903e-06, "loss": 0.0, "num_tokens": 170695087.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07250401005148888, "epoch": 0.30192, "grad_norm": 0.0, "learning_rate": 2.9273866148111425e-06, "loss": 0.0, "step": 3774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.6796875, "completions/mean_terminated_length": 235.0, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.0673106387257576, "epoch": 0.302, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.927044052738755e-06, "loss": 0.0, "num_tokens": 170791942.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06863601878285408, "epoch": 0.30208, "grad_norm": 0.0, "learning_rate": 2.9267014082857155e-06, "loss": 0.0, "step": 3776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.796875, "completions/mean_terminated_length": 236.7941131591797, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.06785988807678223, "epoch": 0.30216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9263586814760062e-06, "loss": 0.0, "num_tokens": 170888940.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06935873627662659, "epoch": 0.30224, "grad_norm": 0.0, "learning_rate": 2.9260158723336132e-06, "loss": 0.0, "step": 3778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 236.234375, "completions/mean_terminated_length": 226.2353057861328, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.07236066833138466, "epoch": 0.30232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.925672980882531e-06, "loss": 0.0, "num_tokens": 170984714.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 3779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07208080589771271, "epoch": 0.3024, "grad_norm": 0.0, "learning_rate": 2.9253300071467572e-06, "loss": 0.0, "step": 3780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 236.109375, "completions/mean_terminated_length": 219.62857055664062, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.07419179379940033, "epoch": 0.30248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.924986951150297e-06, "loss": 0.0, "num_tokens": 171080472.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0714789368212223, "epoch": 0.30256, "grad_norm": 0.0, "learning_rate": 2.9246438129171597e-06, "loss": 0.0, "step": 3782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.9140625, "completions/mean_terminated_length": 237.57142639160156, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.07350168004631996, "epoch": 0.30264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9243005924713625e-06, "loss": 0.0, "num_tokens": 171177357.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07443655654788017, "epoch": 0.30272, "grad_norm": 0.0, "learning_rate": 2.9239572898369273e-06, "loss": 0.0, "step": 3784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.9296875, "completions/mean_terminated_length": 228.57333374023438, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.0735112801194191, "epoch": 0.3028, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9236139050378808e-06, "loss": 0.0, "num_tokens": 171273604.0, "reward": 0.4571110010147095, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4571110010147095, "rewards/reward_fn/std": 0.9856017231941223, "step": 3785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07443603500723839, "epoch": 0.30288, "grad_norm": 0.0, "learning_rate": 2.9232704380982564e-06, "loss": 0.0, "step": 3786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.625, "completions/mean_terminated_length": 243.41334533691406, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.06748945266008377, "epoch": 0.30296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.922926889042093e-06, "loss": 0.0, "num_tokens": 171370964.0, "reward": 0.02706475742161274, "reward_std": 0.0, "rewards/reward_fn/mean": 0.02706475742161274, "rewards/reward_fn/std": 0.07188798487186432, "step": 3787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06698717549443245, "epoch": 0.30304, "grad_norm": 0.0, "learning_rate": 2.9225832578934363e-06, "loss": 0.0, "step": 3788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.0703125, "completions/mean_terminated_length": 221.52703857421875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.07428932189941406, "epoch": 0.30312, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9222395446763368e-06, "loss": 0.0, "num_tokens": 171466717.0, "reward": 0.4136883616447449, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4136883616447449, "rewards/reward_fn/std": 0.9866312742233276, "step": 3789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07303862646222115, "epoch": 0.3032, "grad_norm": 0.0, "learning_rate": 2.92189574941485e-06, "loss": 0.0, "step": 3790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1171875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 236.0390625, "completions/mean_terminated_length": 233.38937377929688, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.06033913418650627, "epoch": 0.30328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9215518721330384e-06, "loss": 0.0, "num_tokens": 171562466.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06255566701292992, "epoch": 0.30336, "grad_norm": 0.0, "learning_rate": 2.9212079128549696e-06, "loss": 0.0, "step": 3792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 197.09375, "completions/mean_terminated_length": 181.34652709960938, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.06118759140372276, "epoch": 0.30344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.920863871604717e-06, "loss": 0.0, "num_tokens": 171653230.0, "reward": 0.8749256134033203, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8749256134033203, "rewards/reward_fn/std": 1.27358877658844, "step": 3793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06018119864165783, "epoch": 0.30352, "grad_norm": 0.0, "learning_rate": 2.92051974840636e-06, "loss": 0.0, "step": 3794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.046875, "completions/mean_terminated_length": 223.1428680419922, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.07868286967277527, "epoch": 0.3036, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.920175543283984e-06, "loss": 0.0, "num_tokens": 171748084.0, "reward": 0.08158833533525467, "reward_std": 0.0, "rewards/reward_fn/mean": 0.08158833533525467, "rewards/reward_fn/std": 0.21671062707901, "step": 3795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07492172345519066, "epoch": 0.30368, "grad_norm": 0.0, "learning_rate": 2.9198312562616793e-06, "loss": 0.0, "step": 3796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 240.390625, "completions/mean_terminated_length": 234.04396057128906, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.06034231558442116, "epoch": 0.30376, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.919486887363542e-06, "loss": 0.0, "num_tokens": 171844390.0, "reward": 0.12499324977397919, "reward_std": 0.0, "rewards/reward_fn/mean": 0.12499324977397919, "rewards/reward_fn/std": 0.3320004642009735, "step": 3797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06310850009322166, "epoch": 0.30384, "grad_norm": 0.0, "learning_rate": 2.919142436613675e-06, "loss": 0.0, "step": 3798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6171875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 248.953125, "completions/mean_terminated_length": 237.59182739257812, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.06625379249453545, "epoch": 0.30392, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.918797904036186e-06, "loss": 0.0, "num_tokens": 171941792.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06179812178015709, "epoch": 0.304, "grad_norm": 0.0, "learning_rate": 2.918453289655189e-06, "loss": 0.0, "step": 3800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.9609375, "completions/mean_terminated_length": 223.2659454345703, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.08064639940857887, "epoch": 0.30408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9181085934948017e-06, "loss": 0.0, "num_tokens": 172037019.0, "reward": 0.0994907021522522, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0994907021522522, "rewards/reward_fn/std": 0.22879931330680847, "step": 3801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07982491701841354, "epoch": 0.30416, "grad_norm": 0.0, "learning_rate": 2.9177638155791515e-06, "loss": 0.0, "step": 3802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 219.015625, "completions/mean_terminated_length": 208.6599884033203, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.07917836681008339, "epoch": 0.30424, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9174189559323673e-06, "loss": 0.0, "num_tokens": 172130589.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 3803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07710766419768333, "epoch": 0.30432, "grad_norm": 0.0, "learning_rate": 2.9170740145785874e-06, "loss": 0.0, "step": 3804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.234375, "completions/mean_terminated_length": 217.1999969482422, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.06584057956933975, "epoch": 0.3044, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.916728991541952e-06, "loss": 0.0, "num_tokens": 172225595.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0631442703306675, "epoch": 0.30448, "grad_norm": 0.0, "learning_rate": 2.9163838868466115e-06, "loss": 0.0, "step": 3806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.15625, "completions/mean_terminated_length": 239.4761962890625, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.06544116511940956, "epoch": 0.30456, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.916038700516717e-06, "loss": 0.0, "num_tokens": 172322511.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06330897100269794, "epoch": 0.30464, "grad_norm": 0.0, "learning_rate": 2.9156934325764303e-06, "loss": 0.0, "step": 3808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6484375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 251.4609375, "completions/mean_terminated_length": 243.08889770507812, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.0660407580435276, "epoch": 0.30472, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9153480830499147e-06, "loss": 0.0, "num_tokens": 172420234.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06758039444684982, "epoch": 0.3048, "grad_norm": 0.0, "learning_rate": 2.9150026519613418e-06, "loss": 0.0, "step": 3810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.4921875, "completions/mean_terminated_length": 221.79452514648438, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.07181563228368759, "epoch": 0.30488, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.914657139334888e-06, "loss": 0.0, "num_tokens": 172516041.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07167913392186165, "epoch": 0.30496, "grad_norm": 0.0, "learning_rate": 2.9143115451947355e-06, "loss": 0.0, "step": 3812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.765625, "completions/mean_terminated_length": 214.543212890625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.0645349957048893, "epoch": 0.30504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9139658695650723e-06, "loss": 0.0, "num_tokens": 172610987.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06497170031070709, "epoch": 0.30512, "grad_norm": 0.0, "learning_rate": 2.913620112470092e-06, "loss": 0.0, "step": 3814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.7265625, "completions/mean_terminated_length": 225.01087951660156, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.07967335358262062, "epoch": 0.3052, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.913274273933994e-06, "loss": 0.0, "num_tokens": 172706440.0, "reward": 0.4701992869377136, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4701992869377136, "rewards/reward_fn/std": 0.9920202493667603, "step": 3815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08273546397686005, "epoch": 0.30528, "grad_norm": 0.0, "learning_rate": 2.912928353980983e-06, "loss": 0.0, "step": 3816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.2421875, "completions/mean_terminated_length": 222.3833465576172, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.07073582708835602, "epoch": 0.30536, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.912582352635271e-06, "loss": 0.0, "num_tokens": 172802727.0, "reward": 0.019831063225865364, "reward_std": 0.0, "rewards/reward_fn/mean": 0.019831063225865364, "rewards/reward_fn/std": 0.052674222737550735, "step": 3817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06936705112457275, "epoch": 0.30544, "grad_norm": 0.0, "learning_rate": 2.912236269921073e-06, "loss": 0.0, "step": 3818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.625, "completions/mean_terminated_length": 219.5294189453125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.06648868694901466, "epoch": 0.30552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9118901058626116e-06, "loss": 0.0, "num_tokens": 172898551.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07262048125267029, "epoch": 0.3056, "grad_norm": 0.0, "learning_rate": 2.9115438604841148e-06, "loss": 0.0, "step": 3820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 236.453125, "completions/mean_terminated_length": 216.90625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.06411679834127426, "epoch": 0.30568, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.911197533809815e-06, "loss": 0.0, "num_tokens": 172994353.0, "reward": 0.06162349507212639, "reward_std": 0.0, "rewards/reward_fn/mean": 0.06162349507212639, "rewards/reward_fn/std": 0.16368108987808228, "step": 3821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06677672266960144, "epoch": 0.30576, "grad_norm": 0.0, "learning_rate": 2.9108511258639537e-06, "loss": 0.0, "step": 3822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 223.0078125, "completions/mean_terminated_length": 206.31765747070312, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.07246454805135727, "epoch": 0.30584, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.910504636670774e-06, "loss": 0.0, "num_tokens": 173088434.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07012911140918732, "epoch": 0.30592, "grad_norm": 0.0, "learning_rate": 2.9101580662545273e-06, "loss": 0.0, "step": 3824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 224.4921875, "completions/mean_terminated_length": 216.06930541992188, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.06692333519458771, "epoch": 0.306, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.909811414639469e-06, "loss": 0.0, "num_tokens": 173182705.0, "reward": 0.09440245479345322, "reward_std": 0.0, "rewards/reward_fn/mean": 0.09440245479345322, "rewards/reward_fn/std": 0.2432476133108139, "step": 3825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06690672039985657, "epoch": 0.30608, "grad_norm": 0.0, "learning_rate": 2.909464681849862e-06, "loss": 0.0, "step": 3826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.0234375, "completions/mean_terminated_length": 237.75, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.06812684237957001, "epoch": 0.30616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9091178679099734e-06, "loss": 0.0, "num_tokens": 173279476.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06824499368667603, "epoch": 0.30624, "grad_norm": 0.0, "learning_rate": 2.9087709728440765e-06, "loss": 0.0, "step": 3828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.078125, "completions/mean_terminated_length": 237.20455932617188, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.0718885026872158, "epoch": 0.30632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9084239966764504e-06, "loss": 0.0, "num_tokens": 173376126.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07086405903100967, "epoch": 0.3064, "grad_norm": 0.0, "learning_rate": 2.90807693943138e-06, "loss": 0.0, "step": 3830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5390625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 242.59375, "completions/mean_terminated_length": 226.91525268554688, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.06791194900870323, "epoch": 0.30648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9077298011331554e-06, "loss": 0.0, "num_tokens": 173472714.0, "reward": 0.009978720918297768, "reward_std": 0.0, "rewards/reward_fn/mean": 0.009978720918297768, "rewards/reward_fn/std": 0.02650495432317257, "step": 3831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07353232055902481, "epoch": 0.30656, "grad_norm": 0.0, "learning_rate": 2.9073825818060727e-06, "loss": 0.0, "step": 3832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.0078125, "completions/mean_terminated_length": 228.97183227539062, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.06467451713979244, "epoch": 0.30664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9070352814744336e-06, "loss": 0.0, "num_tokens": 173569099.0, "reward": 0.48566895723342896, "reward_std": 0.0, "rewards/reward_fn/mean": 0.48566895723342896, "rewards/reward_fn/std": 0.9974362850189209, "step": 3833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06318848207592964, "epoch": 0.30672, "grad_norm": 0.0, "learning_rate": 2.9066879001625454e-06, "loss": 0.0, "step": 3834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 214.6953125, "completions/mean_terminated_length": 202.59596252441406, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.07297943904995918, "epoch": 0.3068, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.906340437894721e-06, "loss": 0.0, "num_tokens": 173662116.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0754658505320549, "epoch": 0.30688, "grad_norm": 0.0, "learning_rate": 2.905992894695279e-06, "loss": 0.0, "step": 3836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.0390625, "completions/mean_terminated_length": 218.07693481445312, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.07043725997209549, "epoch": 0.30696, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9056452705885446e-06, "loss": 0.0, "num_tokens": 173756969.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07046506553888321, "epoch": 0.30704, "grad_norm": 0.0, "learning_rate": 2.9052975655988467e-06, "loss": 0.0, "step": 3838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.4296875, "completions/mean_terminated_length": 232.8400115966797, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.06333938241004944, "epoch": 0.30712, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9049497797505213e-06, "loss": 0.0, "num_tokens": 173853536.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06346738710999489, "epoch": 0.3072, "grad_norm": 0.0, "learning_rate": 2.90460191306791e-06, "loss": 0.0, "step": 3840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.6015625, "completions/mean_terminated_length": 224.13829040527344, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.07225881889462471, "epoch": 0.30728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9042539655753603e-06, "loss": 0.0, "num_tokens": 173948845.0, "reward": 0.04961630329489708, "reward_std": 0.0, "rewards/reward_fn/mean": 0.04961630329489708, "rewards/reward_fn/std": 0.1317882090806961, "step": 3841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07212138921022415, "epoch": 0.30736, "grad_norm": 0.0, "learning_rate": 2.903905937297224e-06, "loss": 0.0, "step": 3842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 223.21875, "completions/mean_terminated_length": 207.2093048095703, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.06810344010591507, "epoch": 0.30744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.903557828257859e-06, "loss": 0.0, "num_tokens": 174042953.0, "reward": 0.08953723311424255, "reward_std": 0.0, "rewards/reward_fn/mean": 0.08953723311424255, "rewards/reward_fn/std": 0.23782405257225037, "step": 3843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0647701621055603, "epoch": 0.30752, "grad_norm": 0.0, "learning_rate": 2.90320963848163e-06, "loss": 0.0, "step": 3844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.453125, "completions/mean_terminated_length": 235.0786590576172, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.06912655383348465, "epoch": 0.3076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9028613679929076e-06, "loss": 0.0, "num_tokens": 174139395.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06673432514071465, "epoch": 0.30768, "grad_norm": 0.0, "learning_rate": 2.9025130168160645e-06, "loss": 0.0, "step": 3846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.6796875, "completions/mean_terminated_length": 237.64614868164062, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.06683893129229546, "epoch": 0.30776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9021645849754836e-06, "loss": 0.0, "num_tokens": 174236506.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06740356236696243, "epoch": 0.30784, "grad_norm": 0.0, "learning_rate": 2.901816072495551e-06, "loss": 0.0, "step": 3848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.0, "completions/mean_terminated_length": 197.8181915283203, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.06607334315776825, "epoch": 0.30792, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.901467479400659e-06, "loss": 0.0, "num_tokens": 174329690.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06677348911762238, "epoch": 0.308, "grad_norm": 0.0, "learning_rate": 2.9011188057152054e-06, "loss": 0.0, "step": 3850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5546875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.1875, "completions/mean_terminated_length": 227.2280731201172, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.07641631364822388, "epoch": 0.30808, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.900770051463593e-06, "loss": 0.0, "num_tokens": 174426354.0, "reward": 0.46803462505340576, "reward_std": 0.0, "rewards/reward_fn/mean": 0.46803462505340576, "rewards/reward_fn/std": 0.9913958311080933, "step": 3851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07677675783634186, "epoch": 0.30816, "grad_norm": 0.0, "learning_rate": 2.9004212166702314e-06, "loss": 0.0, "step": 3852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.9453125, "completions/mean_terminated_length": 238.4394073486328, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.06967191770672798, "epoch": 0.30824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.9000723013595356e-06, "loss": 0.0, "num_tokens": 174523499.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06505143269896507, "epoch": 0.30832, "grad_norm": 0.0, "learning_rate": 2.8997233055559256e-06, "loss": 0.0, "step": 3854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.03125, "completions/mean_terminated_length": 230.1428680419922, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.07504313439130783, "epoch": 0.3084, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.899374229283828e-06, "loss": 0.0, "num_tokens": 174619631.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07839470356702805, "epoch": 0.30848, "grad_norm": 0.0, "learning_rate": 2.899025072567674e-06, "loss": 0.0, "step": 3856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 220.828125, "completions/mean_terminated_length": 208.61053466796875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.0748336873948574, "epoch": 0.30856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.898675835431901e-06, "loss": 0.0, "num_tokens": 174713433.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07163579761981964, "epoch": 0.30864, "grad_norm": 0.0, "learning_rate": 2.898326517900951e-06, "loss": 0.0, "step": 3858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.34375, "completions/mean_terminated_length": 214.39024353027344, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.07330505922436714, "epoch": 0.30872, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.897977119999274e-06, "loss": 0.0, "num_tokens": 174808325.0, "reward": 0.790934681892395, "reward_std": 0.0, "rewards/reward_fn/mean": 0.790934681892395, "rewards/reward_fn/std": 1.2848050594329834, "step": 3859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07143878564238548, "epoch": 0.3088, "grad_norm": 0.0, "learning_rate": 2.897627641751323e-06, "loss": 0.0, "step": 3860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.046875, "completions/mean_terminated_length": 225.59524536132812, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.0708000436425209, "epoch": 0.30888, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.897278083181559e-06, "loss": 0.0, "num_tokens": 174904075.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07699208334088326, "epoch": 0.30896, "grad_norm": 0.0, "learning_rate": 2.8969284443144466e-06, "loss": 0.0, "step": 3862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5546875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.7734375, "completions/mean_terminated_length": 215.07017517089844, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.06015411950647831, "epoch": 0.30904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8965787251744563e-06, "loss": 0.0, "num_tokens": 175000046.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.056093256920576096, "epoch": 0.30912, "grad_norm": 0.0, "learning_rate": 2.896228925786066e-06, "loss": 0.0, "step": 3864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 238.7109375, "completions/mean_terminated_length": 232.7052764892578, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.06852640584111214, "epoch": 0.3092, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8958790461737575e-06, "loss": 0.0, "num_tokens": 175096137.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0689084641635418, "epoch": 0.30928, "grad_norm": 0.0, "learning_rate": 2.895529086362018e-06, "loss": 0.0, "step": 3866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.3046875, "completions/mean_terminated_length": 200.10293579101562, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.07306063175201416, "epoch": 0.30936, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8951790463753417e-06, "loss": 0.0, "num_tokens": 175190640.0, "reward": 0.801705539226532, "reward_std": 0.0, "rewards/reward_fn/mean": 0.801705539226532, "rewards/reward_fn/std": 1.2812025547027588, "step": 3867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07420985773205757, "epoch": 0.30944, "grad_norm": 0.0, "learning_rate": 2.8948289262382283e-06, "loss": 0.0, "step": 3868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.5, "completions/mean_terminated_length": 233.19540405273438, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.07416121661663055, "epoch": 0.30952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8944787259751808e-06, "loss": 0.0, "num_tokens": 175286960.0, "reward": 0.4489399194717407, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4489399194717407, "rewards/reward_fn/std": 0.9873224496841431, "step": 3869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0797562375664711, "epoch": 0.3096, "grad_norm": 0.0, "learning_rate": 2.894128445610711e-06, "loss": 0.0, "step": 3870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.7734375, "completions/mean_terminated_length": 219.3157958984375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.06938839331269264, "epoch": 0.30968, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.893778085169334e-06, "loss": 0.0, "num_tokens": 175381779.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07280310243368149, "epoch": 0.30976, "grad_norm": 0.0, "learning_rate": 2.893427644675572e-06, "loss": 0.0, "step": 3872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 236.5, "completions/mean_terminated_length": 217.0, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.0720352828502655, "epoch": 0.30984, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.893077124153951e-06, "loss": 0.0, "num_tokens": 175477587.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07154642418026924, "epoch": 0.30992, "grad_norm": 0.0, "learning_rate": 2.892726523629005e-06, "loss": 0.0, "step": 3874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.28125, "completions/mean_terminated_length": 216.84506225585938, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.06946379691362381, "epoch": 0.31, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8923758431252713e-06, "loss": 0.0, "num_tokens": 175573111.0, "reward": 0.11220657825469971, "reward_std": 0.0, "rewards/reward_fn/mean": 0.11220657825469971, "rewards/reward_fn/std": 0.29803720116615295, "step": 3875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07039717957377434, "epoch": 0.31008, "grad_norm": 0.0, "learning_rate": 2.892025082667295e-06, "loss": 0.0, "step": 3876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 218.3515625, "completions/mean_terminated_length": 188.1267547607422, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.0734618604183197, "epoch": 0.31016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8916742422796248e-06, "loss": 0.0, "num_tokens": 175666596.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06941638886928558, "epoch": 0.31024, "grad_norm": 0.0, "learning_rate": 2.891323321986816e-06, "loss": 0.0, "step": 3878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.484375, "completions/mean_terminated_length": 232.51112365722656, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.07391257211565971, "epoch": 0.31032, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8909723218134293e-06, "loss": 0.0, "num_tokens": 175762786.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06987176463007927, "epoch": 0.3104, "grad_norm": 0.0, "learning_rate": 2.890621241784031e-06, "loss": 0.0, "step": 3880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.2890625, "completions/mean_terminated_length": 234.59754943847656, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.0703510232269764, "epoch": 0.31048, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.890270081923193e-06, "loss": 0.0, "num_tokens": 175859335.0, "reward": 0.4246163070201874, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4246163070201874, "rewards/reward_fn/std": 0.9858949184417725, "step": 3881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0696256086230278, "epoch": 0.31056, "grad_norm": 0.0, "learning_rate": 2.8899188422554923e-06, "loss": 0.0, "step": 3882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 229.5390625, "completions/mean_terminated_length": 213.66250610351562, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.06961094960570335, "epoch": 0.31064, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.889567522805513e-06, "loss": 0.0, "num_tokens": 175954252.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06889902055263519, "epoch": 0.31072, "grad_norm": 0.0, "learning_rate": 2.889216123597844e-06, "loss": 0.0, "step": 3884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 214.1484375, "completions/mean_terminated_length": 182.61643981933594, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.06951774656772614, "epoch": 0.3108, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8888646446570774e-06, "loss": 0.0, "num_tokens": 176047199.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06687809526920319, "epoch": 0.31088, "grad_norm": 0.0, "learning_rate": 2.888513086007815e-06, "loss": 0.0, "step": 3886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.9765625, "completions/mean_terminated_length": 212.21519470214844, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.07371515035629272, "epoch": 0.31096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8881614476746615e-06, "loss": 0.0, "num_tokens": 176142044.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0693793036043644, "epoch": 0.31104, "grad_norm": 0.0, "learning_rate": 2.8878097296822276e-06, "loss": 0.0, "step": 3888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.6328125, "completions/mean_terminated_length": 213.795166015625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.07130537927150726, "epoch": 0.31112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.88745793205513e-06, "loss": 0.0, "num_tokens": 176236845.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07263773679733276, "epoch": 0.3112, "grad_norm": 0.0, "learning_rate": 2.887106054817992e-06, "loss": 0.0, "step": 3890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.8203125, "completions/mean_terminated_length": 228.8518524169922, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.0735943466424942, "epoch": 0.31128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8867540979954393e-06, "loss": 0.0, "num_tokens": 176332950.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07522942125797272, "epoch": 0.31136, "grad_norm": 0.0, "learning_rate": 2.8864020616121064e-06, "loss": 0.0, "step": 3892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 218.6875, "completions/mean_terminated_length": 204.0869598388672, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.07197476550936699, "epoch": 0.31144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8860499456926318e-06, "loss": 0.0, "num_tokens": 176426478.0, "reward": 0.7698310613632202, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7698310613632202, "rewards/reward_fn/std": 1.293669581413269, "step": 3893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07287287339568138, "epoch": 0.31152, "grad_norm": 0.0, "learning_rate": 2.88569775026166e-06, "loss": 0.0, "step": 3894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.140625, "completions/mean_terminated_length": 209.5294189453125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.0598064661026001, "epoch": 0.3116, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8853454753438416e-06, "loss": 0.0, "num_tokens": 176520832.0, "reward": 0.12497759610414505, "reward_std": 0.0, "rewards/reward_fn/mean": 0.12497759610414505, "rewards/reward_fn/std": 0.3319588899612427, "step": 3895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.057702790945768356, "epoch": 0.31168, "grad_norm": 0.0, "learning_rate": 2.8849931209638303e-06, "loss": 0.0, "step": 3896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 197.9140625, "completions/mean_terminated_length": 185.19049072265625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.06938068941235542, "epoch": 0.31176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8846406871462887e-06, "loss": 0.0, "num_tokens": 176611701.0, "reward": 0.8611998558044434, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8611998558044434, "rewards/reward_fn/std": 1.2727582454681396, "step": 3897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06934279575943947, "epoch": 0.31184, "grad_norm": 0.0, "learning_rate": 2.884288173915884e-06, "loss": 0.0, "step": 3898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 214.28125, "completions/mean_terminated_length": 176.2985076904297, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.05918683111667633, "epoch": 0.31192, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8839355812972864e-06, "loss": 0.0, "num_tokens": 176704665.0, "reward": 0.7886883616447449, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7886883616447449, "rewards/reward_fn/std": 1.2856351137161255, "step": 3899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05849624611437321, "epoch": 0.312, "grad_norm": 0.0, "learning_rate": 2.883582909315175e-06, "loss": 0.0, "step": 3900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 213.9140625, "completions/mean_terminated_length": 205.1792449951172, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.07503341510891914, "epoch": 0.31208, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8832301579942327e-06, "loss": 0.0, "num_tokens": 176797582.0, "reward": 0.03411313518881798, "reward_std": 0.0, "rewards/reward_fn/mean": 0.03411313518881798, "rewards/reward_fn/std": 0.09060950577259064, "step": 3901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07230965793132782, "epoch": 0.31216, "grad_norm": 0.0, "learning_rate": 2.882877327359149e-06, "loss": 0.0, "step": 3902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 237.28125, "completions/mean_terminated_length": 227.4761962890625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.07027589157223701, "epoch": 0.31224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8825244174346176e-06, "loss": 0.0, "num_tokens": 176893490.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06706389039754868, "epoch": 0.31232, "grad_norm": 0.0, "learning_rate": 2.8821714282453387e-06, "loss": 0.0, "step": 3904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.75, "completions/mean_terminated_length": 231.60000610351562, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.06847937777638435, "epoch": 0.3124, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.881818359816018e-06, "loss": 0.0, "num_tokens": 176989842.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06683399528265, "epoch": 0.31248, "grad_norm": 0.0, "learning_rate": 2.881465212171366e-06, "loss": 0.0, "step": 3906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.2421875, "completions/mean_terminated_length": 239.98837280273438, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.06507877260446548, "epoch": 0.31256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8811119853360993e-06, "loss": 0.0, "num_tokens": 177086769.0, "reward": 0.08300459384918213, "reward_std": 0.0, "rewards/reward_fn/mean": 0.08300459384918213, "rewards/reward_fn/std": 0.22047241032123566, "step": 3907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0686437226831913, "epoch": 0.31264, "grad_norm": 0.0, "learning_rate": 2.880758679334941e-06, "loss": 0.0, "step": 3908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 207.546875, "completions/mean_terminated_length": 177.49368286132812, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.06284140981733799, "epoch": 0.31272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8804052941926186e-06, "loss": 0.0, "num_tokens": 177178871.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 3909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06449505686759949, "epoch": 0.3128, "grad_norm": 0.0, "learning_rate": 2.8800518299338642e-06, "loss": 0.0, "step": 3910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 251.9296875, "completions/mean_terminated_length": 244.1591033935547, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.06564551219344139, "epoch": 0.31288, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8796982865834177e-06, "loss": 0.0, "num_tokens": 177276654.0, "reward": 0.5234414935112, "reward_std": 0.0, "rewards/reward_fn/mean": 0.5234414935112, "rewards/reward_fn/std": 0.9779093861579895, "step": 3911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06490187533199787, "epoch": 0.31296, "grad_norm": 0.0, "learning_rate": 2.879344664166023e-06, "loss": 0.0, "step": 3912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.4765625, "completions/mean_terminated_length": 205.61334228515625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.07367969304323196, "epoch": 0.31304, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8789909627064296e-06, "loss": 0.0, "num_tokens": 177371179.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07356061041355133, "epoch": 0.31312, "grad_norm": 0.0, "learning_rate": 2.8786371822293937e-06, "loss": 0.0, "step": 3914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.515625, "completions/mean_terminated_length": 237.3924102783203, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.06943677738308907, "epoch": 0.3132, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8782833227596754e-06, "loss": 0.0, "num_tokens": 177468013.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07163753360509872, "epoch": 0.31328, "grad_norm": 0.0, "learning_rate": 2.877929384322042e-06, "loss": 0.0, "step": 3916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 211.1796875, "completions/mean_terminated_length": 198.05050659179688, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.06582324951887131, "epoch": 0.31336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8775753669412637e-06, "loss": 0.0, "num_tokens": 177560580.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06241420842707157, "epoch": 0.31344, "grad_norm": 0.0, "learning_rate": 2.87722127064212e-06, "loss": 0.0, "step": 3918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.453125, "completions/mean_terminated_length": 208.1265869140625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.0661827102303505, "epoch": 0.31352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.876867095449393e-06, "loss": 0.0, "num_tokens": 177655102.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06905877217650414, "epoch": 0.3136, "grad_norm": 0.0, "learning_rate": 2.876512841387871e-06, "loss": 0.0, "step": 3920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 206.296875, "completions/mean_terminated_length": 191.08163452148438, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.06866485252976418, "epoch": 0.31368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8761585084823483e-06, "loss": 0.0, "num_tokens": 177747044.0, "reward": 0.76492840051651, "reward_std": 0.0, "rewards/reward_fn/mean": 0.76492840051651, "rewards/reward_fn/std": 1.296067476272583, "step": 3921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06725243106484413, "epoch": 0.31376, "grad_norm": 0.0, "learning_rate": 2.875804096757625e-06, "loss": 0.0, "step": 3922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.3125, "completions/mean_terminated_length": 204.96551513671875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.07366889342665672, "epoch": 0.31384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.875449606238505e-06, "loss": 0.0, "num_tokens": 177840908.0, "reward": 0.6424000263214111, "reward_std": 0.0, "rewards/reward_fn/mean": 0.6424000263214111, "rewards/reward_fn/std": 0.9766880869865417, "step": 3923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07758117467164993, "epoch": 0.31392, "grad_norm": 0.0, "learning_rate": 2.8750950369498e-06, "loss": 0.0, "step": 3924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1171875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 218.734375, "completions/mean_terminated_length": 213.78761291503906, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.07051075249910355, "epoch": 0.314, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.874740388916326e-06, "loss": 0.0, "num_tokens": 177934442.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06908019632101059, "epoch": 0.31408, "grad_norm": 0.0, "learning_rate": 2.8743856621629037e-06, "loss": 0.0, "step": 3926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.125, "completions/mean_terminated_length": 237.68116760253906, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.0675807073712349, "epoch": 0.31416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.874030856714361e-06, "loss": 0.0, "num_tokens": 178031482.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07129943370819092, "epoch": 0.31424, "grad_norm": 0.0, "learning_rate": 2.8736759725955304e-06, "loss": 0.0, "step": 3928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 220.1015625, "completions/mean_terminated_length": 212.23809814453125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.07105468586087227, "epoch": 0.31432, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8733210098312493e-06, "loss": 0.0, "num_tokens": 178125191.0, "reward": 0.7673865556716919, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7673865556716919, "rewards/reward_fn/std": 1.2948493957519531, "step": 3929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07386977970600128, "epoch": 0.3144, "grad_norm": 0.0, "learning_rate": 2.872965968446363e-06, "loss": 0.0, "step": 3930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.5546875, "completions/mean_terminated_length": 210.25001525878906, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.07382987812161446, "epoch": 0.31448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8726108484657193e-06, "loss": 0.0, "num_tokens": 178220750.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07332389801740646, "epoch": 0.31456, "grad_norm": 0.0, "learning_rate": 2.8722556499141734e-06, "loss": 0.0, "step": 3932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.890625, "completions/mean_terminated_length": 232.48077392578125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.06699561327695847, "epoch": 0.31464, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.871900372816585e-06, "loss": 0.0, "num_tokens": 178316608.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06706458330154419, "epoch": 0.31472, "grad_norm": 0.0, "learning_rate": 2.87154501719782e-06, "loss": 0.0, "step": 3934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.78125, "completions/mean_terminated_length": 235.94871520996094, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.06873761489987373, "epoch": 0.3148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8711895830827493e-06, "loss": 0.0, "num_tokens": 178413348.0, "reward": 0.009978720918297768, "reward_std": 0.0, "rewards/reward_fn/mean": 0.009978720918297768, "rewards/reward_fn/std": 0.02650495432317257, "step": 3935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07095124199986458, "epoch": 0.31488, "grad_norm": 0.0, "learning_rate": 2.8708340704962506e-06, "loss": 0.0, "step": 3936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.046875, "completions/mean_terminated_length": 236.38461303710938, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.06881583854556084, "epoch": 0.31496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8704784794632045e-06, "loss": 0.0, "num_tokens": 178510122.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06712205708026886, "epoch": 0.31504, "grad_norm": 0.0, "learning_rate": 2.8701228100084998e-06, "loss": 0.0, "step": 3938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 238.9375, "completions/mean_terminated_length": 232.51612854003906, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.07238728925585747, "epoch": 0.31512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8697670621570287e-06, "loss": 0.0, "num_tokens": 178606242.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06805041432380676, "epoch": 0.3152, "grad_norm": 0.0, "learning_rate": 2.8694112359336906e-06, "loss": 0.0, "step": 3940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 242.7578125, "completions/mean_terminated_length": 231.7857208251953, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.06744763255119324, "epoch": 0.31528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8690553313633886e-06, "loss": 0.0, "num_tokens": 178702851.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06560862809419632, "epoch": 0.31536, "grad_norm": 0.0, "learning_rate": 2.8686993484710337e-06, "loss": 0.0, "step": 3942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.8671875, "completions/mean_terminated_length": 238.23287963867188, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.0723671019077301, "epoch": 0.31544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8683432872815395e-06, "loss": 0.0, "num_tokens": 178799858.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07560243085026741, "epoch": 0.31552, "grad_norm": 0.0, "learning_rate": 2.867987147819827e-06, "loss": 0.0, "step": 3944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.28125, "completions/mean_terminated_length": 209.70834350585938, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.07620994746685028, "epoch": 0.3156, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.867630930110823e-06, "loss": 0.0, "num_tokens": 178893718.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07228846102952957, "epoch": 0.31568, "grad_norm": 0.0, "learning_rate": 2.8672746341794576e-06, "loss": 0.0, "step": 3946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.7734375, "completions/mean_terminated_length": 220.60714721679688, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.07002337276935577, "epoch": 0.31576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8669182600506693e-06, "loss": 0.0, "num_tokens": 178989049.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06969457119703293, "epoch": 0.31584, "grad_norm": 0.0, "learning_rate": 2.866561807749399e-06, "loss": 0.0, "step": 3948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 240.7421875, "completions/mean_terminated_length": 231.2784881591797, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.07050774618983269, "epoch": 0.31592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8662052773005956e-06, "loss": 0.0, "num_tokens": 179085400.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07004467397928238, "epoch": 0.316, "grad_norm": 0.0, "learning_rate": 2.8658486687292116e-06, "loss": 0.0, "step": 3950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.4140625, "completions/mean_terminated_length": 241.10989379882812, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "entropy": 0.06778257712721825, "epoch": 0.31608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.865491982060207e-06, "loss": 0.0, "num_tokens": 179182349.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06890708953142166, "epoch": 0.31616, "grad_norm": 0.0, "learning_rate": 2.8651352173185454e-06, "loss": 0.0, "step": 3952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.5859375, "completions/mean_terminated_length": 224.14865112304688, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.07246942073106766, "epoch": 0.31624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8647783745291963e-06, "loss": 0.0, "num_tokens": 179278296.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0743614211678505, "epoch": 0.31632, "grad_norm": 0.0, "learning_rate": 2.864421453717135e-06, "loss": 0.0, "step": 3954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.109375, "completions/mean_terminated_length": 212.96104431152344, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.07380681857466698, "epoch": 0.3164, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8640644549073436e-06, "loss": 0.0, "num_tokens": 179373286.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 3955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07277899608016014, "epoch": 0.31648, "grad_norm": 0.0, "learning_rate": 2.863707378124806e-06, "loss": 0.0, "step": 3956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1328125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 203.875, "completions/mean_terminated_length": 195.8918914794922, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.07193569466471672, "epoch": 0.31656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8633502233945153e-06, "loss": 0.0, "num_tokens": 179464918.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07287402078509331, "epoch": 0.31664, "grad_norm": 0.0, "learning_rate": 2.8629929907414686e-06, "loss": 0.0, "step": 3958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 214.09375, "completions/mean_terminated_length": 181.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.06754925847053528, "epoch": 0.31672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8626356801906673e-06, "loss": 0.0, "num_tokens": 179557858.0, "reward": 0.022260108962655067, "reward_std": 0.0, "rewards/reward_fn/mean": 0.022260108962655067, "rewards/reward_fn/std": 0.059126123785972595, "step": 3959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06782492250204086, "epoch": 0.3168, "grad_norm": 0.0, "learning_rate": 2.8622782917671207e-06, "loss": 0.0, "step": 3960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.859375, "completions/mean_terminated_length": 232.3098602294922, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.07451244816184044, "epoch": 0.31688, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.861920825495841e-06, "loss": 0.0, "num_tokens": 179654480.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 3961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07344105839729309, "epoch": 0.31696, "grad_norm": 0.0, "learning_rate": 2.8615632814018476e-06, "loss": 0.0, "step": 3962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 242.375, "completions/mean_terminated_length": 230.72463989257812, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.06579166650772095, "epoch": 0.31704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.861205659510165e-06, "loss": 0.0, "num_tokens": 179751040.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07030333951115608, "epoch": 0.31712, "grad_norm": 0.0, "learning_rate": 2.8608479598458228e-06, "loss": 0.0, "step": 3964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 207.4375, "completions/mean_terminated_length": 187.6923065185547, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.0707140602171421, "epoch": 0.3172, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8604901824338563e-06, "loss": 0.0, "num_tokens": 179843128.0, "reward": 0.38249102234840393, "reward_std": 0.0, "rewards/reward_fn/mean": 0.38249102234840393, "rewards/reward_fn/std": 0.9934079051017761, "step": 3965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07112126052379608, "epoch": 0.31728, "grad_norm": 0.0, "learning_rate": 2.860132327299306e-06, "loss": 0.0, "step": 3966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.484375, "completions/mean_terminated_length": 238.96875, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.06899940595030785, "epoch": 0.31736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.859774394467218e-06, "loss": 0.0, "num_tokens": 179940342.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07317359745502472, "epoch": 0.31744, "grad_norm": 0.0, "learning_rate": 2.8594163839626435e-06, "loss": 0.0, "step": 3968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.21875, "completions/mean_terminated_length": 213.65516662597656, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.07122565060853958, "epoch": 0.31752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.85905829581064e-06, "loss": 0.0, "num_tokens": 180034962.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07331375777721405, "epoch": 0.3176, "grad_norm": 0.0, "learning_rate": 2.85870013003627e-06, "loss": 0.0, "step": 3970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.6640625, "completions/mean_terminated_length": 216.68539428710938, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.07313789054751396, "epoch": 0.31768, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8583418866646e-06, "loss": 0.0, "num_tokens": 180129767.0, "reward": 0.40443697571754456, "reward_std": 0.0, "rewards/reward_fn/mean": 0.40443697571754456, "rewards/reward_fn/std": 0.9879209995269775, "step": 3971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07032495364546776, "epoch": 0.31776, "grad_norm": 0.0, "learning_rate": 2.857983565720705e-06, "loss": 0.0, "step": 3972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.0859375, "completions/mean_terminated_length": 235.5287322998047, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.078624177724123, "epoch": 0.31784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.857625167229663e-06, "loss": 0.0, "num_tokens": 180226290.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0757593922317028, "epoch": 0.31792, "grad_norm": 0.0, "learning_rate": 2.8572666912165577e-06, "loss": 0.0, "step": 3974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 207.9453125, "completions/mean_terminated_length": 188.40660095214844, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.07691764831542969, "epoch": 0.318, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.856908137706479e-06, "loss": 0.0, "num_tokens": 180318443.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07285283878445625, "epoch": 0.31808, "grad_norm": 0.0, "learning_rate": 2.8565495067245222e-06, "loss": 0.0, "step": 3976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 214.7109375, "completions/mean_terminated_length": 186.4605255126953, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06958691030740738, "epoch": 0.31816, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8561907982957866e-06, "loss": 0.0, "num_tokens": 180411462.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06810364127159119, "epoch": 0.31824, "grad_norm": 0.0, "learning_rate": 2.855832012445379e-06, "loss": 0.0, "step": 3978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.9453125, "completions/mean_terminated_length": 216.74488830566406, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.07327448949217796, "epoch": 0.31832, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.855473149198411e-06, "loss": 0.0, "num_tokens": 180505919.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06847875192761421, "epoch": 0.3184, "grad_norm": 0.0, "learning_rate": 2.855114208579998e-06, "loss": 0.0, "step": 3980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.6875, "completions/mean_terminated_length": 220.1052703857422, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.06907721608877182, "epoch": 0.31848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8547551906152623e-06, "loss": 0.0, "num_tokens": 180601495.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 3981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07246102020144463, "epoch": 0.31856, "grad_norm": 0.0, "learning_rate": 2.854396095329332e-06, "loss": 0.0, "step": 3982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5078125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.0, "completions/mean_terminated_length": 235.68255615234375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.06596842408180237, "epoch": 0.31864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8540369227473397e-06, "loss": 0.0, "num_tokens": 180698519.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0682225339114666, "epoch": 0.31872, "grad_norm": 0.0, "learning_rate": 2.853677672894424e-06, "loss": 0.0, "step": 3984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.2890625, "completions/mean_terminated_length": 222.8058319091797, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.06445256061851978, "epoch": 0.3188, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8533183457957275e-06, "loss": 0.0, "num_tokens": 180793404.0, "reward": 0.055780451744794846, "reward_std": 0.0, "rewards/reward_fn/mean": 0.055780451744794846, "rewards/reward_fn/std": 0.14816109836101532, "step": 3985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06461957097053528, "epoch": 0.31888, "grad_norm": 0.0, "learning_rate": 2.8529589414764e-06, "loss": 0.0, "step": 3986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 232.09375, "completions/mean_terminated_length": 215.20001220703125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.05898953787982464, "epoch": 0.31896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8525994599615965e-06, "loss": 0.0, "num_tokens": 180888648.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06553537212312222, "epoch": 0.31904, "grad_norm": 0.0, "learning_rate": 2.8522399012764764e-06, "loss": 0.0, "step": 3988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.1953125, "completions/mean_terminated_length": 221.85897827148438, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.07551023736596107, "epoch": 0.31912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.851880265446205e-06, "loss": 0.0, "num_tokens": 180984289.0, "reward": 0.12497152388095856, "reward_std": 0.0, "rewards/reward_fn/mean": 0.12497152388095856, "rewards/reward_fn/std": 0.33194276690483093, "step": 3989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07530783116817474, "epoch": 0.3192, "grad_norm": 0.0, "learning_rate": 2.851520552495953e-06, "loss": 0.0, "step": 3990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 239.2265625, "completions/mean_terminated_length": 225.76055908203125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.07796178758144379, "epoch": 0.31928, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.851160762450897e-06, "loss": 0.0, "num_tokens": 181080446.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 3991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07630223408341408, "epoch": 0.31936, "grad_norm": 0.0, "learning_rate": 2.8508008953362177e-06, "loss": 0.0, "step": 3992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 231.1640625, "completions/mean_terminated_length": 220.6777801513672, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.07097968459129333, "epoch": 0.31944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.850440951177102e-06, "loss": 0.0, "num_tokens": 181175571.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 3993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0685010552406311, "epoch": 0.31952, "grad_norm": 0.0, "learning_rate": 2.8500809299987435e-06, "loss": 0.0, "step": 3994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.8359375, "completions/mean_terminated_length": 219.19781494140625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.0726551041007042, "epoch": 0.3196, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.849720831826338e-06, "loss": 0.0, "num_tokens": 181270526.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07097001001238823, "epoch": 0.31968, "grad_norm": 0.0, "learning_rate": 2.84936065668509e-06, "loss": 0.0, "step": 3996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.75, "completions/mean_terminated_length": 226.87640380859375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.05636790208518505, "epoch": 0.31976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.849000404600208e-06, "loss": 0.0, "num_tokens": 181366238.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 3997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05779581516981125, "epoch": 0.31984, "grad_norm": 0.0, "learning_rate": 2.8486400755969045e-06, "loss": 0.0, "step": 3998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.3671875, "completions/mean_terminated_length": 233.8229217529297, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.0701025016605854, "epoch": 0.31992, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8482796697004e-06, "loss": 0.0, "num_tokens": 181462413.0, "reward": 0.03411313518881798, "reward_std": 0.0, "rewards/reward_fn/mean": 0.03411313518881798, "rewards/reward_fn/std": 0.09060950577259064, "step": 3999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0674302726984024, "epoch": 0.32, "grad_norm": 0.0, "learning_rate": 2.847919186935918e-06, "loss": 0.0, "step": 4000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 223.71875, "completions/mean_terminated_length": 208.50575256347656, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.06693456321954727, "epoch": 0.32008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.84755862732869e-06, "loss": 0.0, "num_tokens": 181556585.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06420034542679787, "epoch": 0.32016, "grad_norm": 0.0, "learning_rate": 2.84719799090395e-06, "loss": 0.0, "step": 4002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.09375, "completions/mean_terminated_length": 240.89552307128906, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.06751808524131775, "epoch": 0.32024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8468372776869395e-06, "loss": 0.0, "num_tokens": 181653877.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06985592842102051, "epoch": 0.32032, "grad_norm": 0.0, "learning_rate": 2.8464764877029036e-06, "loss": 0.0, "step": 4004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6171875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 240.40625, "completions/mean_terminated_length": 215.2653045654297, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.06427551805973053, "epoch": 0.3204, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8461156209770958e-06, "loss": 0.0, "num_tokens": 181750185.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06250563263893127, "epoch": 0.32048, "grad_norm": 0.0, "learning_rate": 2.8457546775347706e-06, "loss": 0.0, "step": 4006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 223.171875, "completions/mean_terminated_length": 209.82418823242188, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.06847020983695984, "epoch": 0.32056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8453936574011917e-06, "loss": 0.0, "num_tokens": 181844287.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06878373771905899, "epoch": 0.32064, "grad_norm": 0.0, "learning_rate": 2.845032560601626e-06, "loss": 0.0, "step": 4008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.9765625, "completions/mean_terminated_length": 230.742431640625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.0704880878329277, "epoch": 0.32072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8446713871613476e-06, "loss": 0.0, "num_tokens": 181940924.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07189737260341644, "epoch": 0.3208, "grad_norm": 0.0, "learning_rate": 2.8443101371056333e-06, "loss": 0.0, "step": 4010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 224.0703125, "completions/mean_terminated_length": 208.47674560546875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.06818320602178574, "epoch": 0.32088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8439488104597676e-06, "loss": 0.0, "num_tokens": 182035141.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06723343580961227, "epoch": 0.32096, "grad_norm": 0.0, "learning_rate": 2.84358740724904e-06, "loss": 0.0, "step": 4012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.109375, "completions/mean_terminated_length": 236.37930297851562, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.0717865601181984, "epoch": 0.32104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.843225927498744e-06, "loss": 0.0, "num_tokens": 182132307.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06967419385910034, "epoch": 0.32112, "grad_norm": 0.0, "learning_rate": 2.8428643712341796e-06, "loss": 0.0, "step": 4014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 240.140625, "completions/mean_terminated_length": 235.28570556640625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.0654161348938942, "epoch": 0.3212, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.842502738480652e-06, "loss": 0.0, "num_tokens": 182228581.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.061092084273695946, "epoch": 0.32128, "grad_norm": 0.0, "learning_rate": 2.842141029263472e-06, "loss": 0.0, "step": 4016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 231.0390625, "completions/mean_terminated_length": 212.23287963867188, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.06893177330493927, "epoch": 0.32136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8417792436079548e-06, "loss": 0.0, "num_tokens": 182323690.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06641891971230507, "epoch": 0.32144, "grad_norm": 0.0, "learning_rate": 2.841417381539422e-06, "loss": 0.0, "step": 4018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.734375, "completions/mean_terminated_length": 208.3260955810547, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.06287398561835289, "epoch": 0.32152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8410554430832007e-06, "loss": 0.0, "num_tokens": 182417608.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06031826138496399, "epoch": 0.3216, "grad_norm": 0.0, "learning_rate": 2.840693428264621e-06, "loss": 0.0, "step": 4020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 232.6796875, "completions/mean_terminated_length": 221.6896514892578, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.06837688013911247, "epoch": 0.32168, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8403313371090222e-06, "loss": 0.0, "num_tokens": 182512927.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06579243764281273, "epoch": 0.32176, "grad_norm": 0.0, "learning_rate": 2.839969169641745e-06, "loss": 0.0, "step": 4022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 204.3828125, "completions/mean_terminated_length": 191.8543701171875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.06864973902702332, "epoch": 0.32184, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.839606925888139e-06, "loss": 0.0, "num_tokens": 182604624.0, "reward": 0.759978711605072, "reward_std": 0.0, "rewards/reward_fn/mean": 0.759978711605072, "rewards/reward_fn/std": 1.2986160516738892, "step": 4023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06627876684069633, "epoch": 0.32192, "grad_norm": 0.0, "learning_rate": 2.8392446058735563e-06, "loss": 0.0, "step": 4024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.8046875, "completions/mean_terminated_length": 218.53623962402344, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.07132448628544807, "epoch": 0.322, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8388822096233555e-06, "loss": 0.0, "num_tokens": 182700343.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06857519596815109, "epoch": 0.32208, "grad_norm": 0.0, "learning_rate": 2.8385197371629013e-06, "loss": 0.0, "step": 4026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.28125, "completions/mean_terminated_length": 215.11764526367188, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.07279583439230919, "epoch": 0.32216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8381571885175616e-06, "loss": 0.0, "num_tokens": 182795867.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07079675421118736, "epoch": 0.32224, "grad_norm": 0.0, "learning_rate": 2.837794563712713e-06, "loss": 0.0, "step": 4028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.953125, "completions/mean_terminated_length": 234.58975219726562, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.07147575914859772, "epoch": 0.32232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.837431862773733e-06, "loss": 0.0, "num_tokens": 182892501.0, "reward": 0.055780451744794846, "reward_std": 0.0, "rewards/reward_fn/mean": 0.055780451744794846, "rewards/reward_fn/std": 0.14816109836101532, "step": 4029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0733051672577858, "epoch": 0.3224, "grad_norm": 0.0, "learning_rate": 2.837069085726009e-06, "loss": 0.0, "step": 4030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.3515625, "completions/mean_terminated_length": 235.8513641357422, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.07054116949439049, "epoch": 0.32248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8367062325949306e-06, "loss": 0.0, "num_tokens": 182989314.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07027445361018181, "epoch": 0.32256, "grad_norm": 0.0, "learning_rate": 2.836343303405893e-06, "loss": 0.0, "step": 4032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 235.5234375, "completions/mean_terminated_length": 227.81719970703125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.0691993311047554, "epoch": 0.32264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8359802981842986e-06, "loss": 0.0, "num_tokens": 183084997.0, "reward": 0.02943696826696396, "reward_std": 0.0, "rewards/reward_fn/mean": 0.02943696826696396, "rewards/reward_fn/std": 0.07818891853094101, "step": 4033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07145925238728523, "epoch": 0.32272, "grad_norm": 0.0, "learning_rate": 2.8356172169555533e-06, "loss": 0.0, "step": 4034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.2890625, "completions/mean_terminated_length": 207.24359130859375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.06746140494942665, "epoch": 0.3228, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8352540597450693e-06, "loss": 0.0, "num_tokens": 183179498.0, "reward": 0.002499666763469577, "reward_std": 0.0, "rewards/reward_fn/mean": 0.002499666763469577, "rewards/reward_fn/std": 0.006639483384788036, "step": 4035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06882794201374054, "epoch": 0.32288, "grad_norm": 0.0, "learning_rate": 2.834890826578263e-06, "loss": 0.0, "step": 4036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.7578125, "completions/mean_terminated_length": 213.4615478515625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.08311106637120247, "epoch": 0.32296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.834527517480558e-06, "loss": 0.0, "num_tokens": 183273931.0, "reward": 0.7549973130226135, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7549973130226135, "rewards/reward_fn/std": 1.3013103008270264, "step": 4037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07842811942100525, "epoch": 0.32304, "grad_norm": 0.0, "learning_rate": 2.8341641324773817e-06, "loss": 0.0, "step": 4038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 201.765625, "completions/mean_terminated_length": 182.92633056640625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.07042384147644043, "epoch": 0.32312, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.833800671594167e-06, "loss": 0.0, "num_tokens": 183365293.0, "reward": 0.38249102234840393, "reward_std": 0.0, "rewards/reward_fn/mean": 0.38249102234840393, "rewards/reward_fn/std": 0.9934079051017761, "step": 4039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07605878636240959, "epoch": 0.3232, "grad_norm": 0.0, "learning_rate": 2.833437134856352e-06, "loss": 0.0, "step": 4040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.125, "completions/mean_terminated_length": 227.574462890625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.06973884999752045, "epoch": 0.32328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8330735222893807e-06, "loss": 0.0, "num_tokens": 183460925.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07231717556715012, "epoch": 0.32336, "grad_norm": 0.0, "learning_rate": 2.8327098339187025e-06, "loss": 0.0, "step": 4042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 233.6484375, "completions/mean_terminated_length": 227.10101318359375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.06186499632894993, "epoch": 0.32344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8323460697697714e-06, "loss": 0.0, "num_tokens": 183556368.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06194099970161915, "epoch": 0.32352, "grad_norm": 0.0, "learning_rate": 2.8319822298680474e-06, "loss": 0.0, "step": 4044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.078125, "completions/mean_terminated_length": 232.57470703125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.061265915632247925, "epoch": 0.3236, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8316183142389953e-06, "loss": 0.0, "num_tokens": 183652634.0, "reward": 0.08879508078098297, "reward_std": 0.0, "rewards/reward_fn/mean": 0.08879508078098297, "rewards/reward_fn/std": 0.166224405169487, "step": 4045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.061018744483590126, "epoch": 0.32368, "grad_norm": 0.0, "learning_rate": 2.8312543229080854e-06, "loss": 0.0, "step": 4046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 202.171875, "completions/mean_terminated_length": 191.0, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.06592342257499695, "epoch": 0.32376, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8308902559007926e-06, "loss": 0.0, "num_tokens": 183744048.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06909085810184479, "epoch": 0.32384, "grad_norm": 0.0, "learning_rate": 2.8305261132425983e-06, "loss": 0.0, "step": 4048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.5703125, "completions/mean_terminated_length": 237.31521606445312, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.0575595423579216, "epoch": 0.32392, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8301618949589883e-06, "loss": 0.0, "num_tokens": 183840633.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05840107053518295, "epoch": 0.324, "grad_norm": 0.0, "learning_rate": 2.8297976010754546e-06, "loss": 0.0, "step": 4050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 204.65625, "completions/mean_terminated_length": 197.32144165039062, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.06767700985074043, "epoch": 0.32408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.829433231617494e-06, "loss": 0.0, "num_tokens": 183932365.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07088359445333481, "epoch": 0.32416, "grad_norm": 0.0, "learning_rate": 2.8290687866106085e-06, "loss": 0.0, "step": 4052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.453125, "completions/mean_terminated_length": 238.19276428222656, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.06748392805457115, "epoch": 0.32424, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.828704266080304e-06, "loss": 0.0, "num_tokens": 184029191.0, "reward": 0.4977909028530121, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4977909028530121, "rewards/reward_fn/std": 1.0028369426727295, "step": 4053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0675201267004013, "epoch": 0.32432, "grad_norm": 0.0, "learning_rate": 2.828339670052095e-06, "loss": 0.0, "step": 4054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.59375, "completions/mean_terminated_length": 218.89794921875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.062198638916015625, "epoch": 0.3244, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8279749985514977e-06, "loss": 0.0, "num_tokens": 184123859.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06423508003354073, "epoch": 0.32448, "grad_norm": 0.0, "learning_rate": 2.827610251604037e-06, "loss": 0.0, "step": 4056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 234.5234375, "completions/mean_terminated_length": 213.046875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.0704004243016243, "epoch": 0.32456, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8272454292352394e-06, "loss": 0.0, "num_tokens": 184219414.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0698675587773323, "epoch": 0.32464, "grad_norm": 0.0, "learning_rate": 2.82688053147064e-06, "loss": 0.0, "step": 4058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 245.5390625, "completions/mean_terminated_length": 229.22000122070312, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.06869212165474892, "epoch": 0.32472, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8265155583357777e-06, "loss": 0.0, "num_tokens": 184316379.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06685098260641098, "epoch": 0.3248, "grad_norm": 0.0, "learning_rate": 2.8261505098561962e-06, "loss": 0.0, "step": 4060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.875, "completions/mean_terminated_length": 212.09754943847656, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.06784158200025558, "epoch": 0.32488, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8257853860574456e-06, "loss": 0.0, "num_tokens": 184411083.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06590418890118599, "epoch": 0.32496, "grad_norm": 0.0, "learning_rate": 2.8254201869650804e-06, "loss": 0.0, "step": 4062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.78125, "completions/mean_terminated_length": 235.4857177734375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.06593677029013634, "epoch": 0.32504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.82505491260466e-06, "loss": 0.0, "num_tokens": 184507951.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0661749504506588, "epoch": 0.32512, "grad_norm": 0.0, "learning_rate": 2.824689563001751e-06, "loss": 0.0, "step": 4064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 224.8203125, "completions/mean_terminated_length": 206.7283935546875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.07769059762358665, "epoch": 0.3252, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.824324138181923e-06, "loss": 0.0, "num_tokens": 184602264.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08004432544112206, "epoch": 0.32528, "grad_norm": 0.0, "learning_rate": 2.8239586381707535e-06, "loss": 0.0, "step": 4066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.8125, "completions/mean_terminated_length": 225.08334350585938, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.06211455911397934, "epoch": 0.32536, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.823593062993821e-06, "loss": 0.0, "num_tokens": 184697600.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06760568171739578, "epoch": 0.32544, "grad_norm": 0.0, "learning_rate": 2.8232274126767148e-06, "loss": 0.0, "step": 4068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 236.6953125, "completions/mean_terminated_length": 231.2899932861328, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.06435820087790489, "epoch": 0.32552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8228616872450243e-06, "loss": 0.0, "num_tokens": 184793433.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06454909965395927, "epoch": 0.3256, "grad_norm": 0.0, "learning_rate": 2.822495886724347e-06, "loss": 0.0, "step": 4070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.2890625, "completions/mean_terminated_length": 210.2916717529297, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.07368587329983711, "epoch": 0.32568, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8221300111402858e-06, "loss": 0.0, "num_tokens": 184888446.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07038174197077751, "epoch": 0.32576, "grad_norm": 0.0, "learning_rate": 2.8217640605184476e-06, "loss": 0.0, "step": 4072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.140625, "completions/mean_terminated_length": 227.59573364257812, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.0699879340827465, "epoch": 0.32584, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.821398034884445e-06, "loss": 0.0, "num_tokens": 184984080.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07148624956607819, "epoch": 0.32592, "grad_norm": 0.0, "learning_rate": 2.8210319342638963e-06, "loss": 0.0, "step": 4074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 241.6640625, "completions/mean_terminated_length": 235.83517456054688, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.06609093025326729, "epoch": 0.326, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8206657586824244e-06, "loss": 0.0, "num_tokens": 185080549.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.068280428647995, "epoch": 0.32608, "grad_norm": 0.0, "learning_rate": 2.8202995081656574e-06, "loss": 0.0, "step": 4076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 217.5234375, "completions/mean_terminated_length": 205.74488830566406, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.0755794569849968, "epoch": 0.32616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8199331827392296e-06, "loss": 0.0, "num_tokens": 185173928.0, "reward": 0.4384971857070923, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4384971857070923, "rewards/reward_fn/std": 0.9861915111541748, "step": 4077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08045640587806702, "epoch": 0.32624, "grad_norm": 0.0, "learning_rate": 2.81956678242878e-06, "loss": 0.0, "step": 4078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 246.1953125, "completions/mean_terminated_length": 239.7012939453125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.07284042984247208, "epoch": 0.32632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8192003072599514e-06, "loss": 0.0, "num_tokens": 185270977.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07446407899260521, "epoch": 0.3264, "grad_norm": 0.0, "learning_rate": 2.818833757258395e-06, "loss": 0.0, "step": 4080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6171875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.9296875, "completions/mean_terminated_length": 216.6326446533203, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.06731745973229408, "epoch": 0.32648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8184671324497643e-06, "loss": 0.0, "num_tokens": 185367352.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06879789382219315, "epoch": 0.32656, "grad_norm": 0.0, "learning_rate": 2.8181004328597194e-06, "loss": 0.0, "step": 4082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 220.859375, "completions/mean_terminated_length": 208.6526336669922, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.06792431697249413, "epoch": 0.32664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.817733658513926e-06, "loss": 0.0, "num_tokens": 185461158.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06831862404942513, "epoch": 0.32672, "grad_norm": 0.0, "learning_rate": 2.8173668094380534e-06, "loss": 0.0, "step": 4084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.0546875, "completions/mean_terminated_length": 209.8865966796875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.06911158934235573, "epoch": 0.3268, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8169998856577776e-06, "loss": 0.0, "num_tokens": 185554989.0, "reward": 0.4091131389141083, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4091131389141083, "rewards/reward_fn/std": 0.9871928691864014, "step": 4085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06860724464058876, "epoch": 0.32688, "grad_norm": 0.0, "learning_rate": 2.81663288719878e-06, "loss": 0.0, "step": 4086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 223.125, "completions/mean_terminated_length": 205.90476989746094, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.07736466079950333, "epoch": 0.32696, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8162658140867454e-06, "loss": 0.0, "num_tokens": 185649085.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07822505012154579, "epoch": 0.32704, "grad_norm": 0.0, "learning_rate": 2.8158986663473666e-06, "loss": 0.0, "step": 4088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.5390625, "completions/mean_terminated_length": 234.7391357421875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.07242729142308235, "epoch": 0.32712, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8155314440063385e-06, "loss": 0.0, "num_tokens": 185745922.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07274888455867767, "epoch": 0.3272, "grad_norm": 0.0, "learning_rate": 2.815164147089363e-06, "loss": 0.0, "step": 4090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.640625, "completions/mean_terminated_length": 227.84091186523438, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.0707184486091137, "epoch": 0.32728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8147967756221482e-06, "loss": 0.0, "num_tokens": 185841748.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06661075353622437, "epoch": 0.32736, "grad_norm": 0.0, "learning_rate": 2.8144293296304056e-06, "loss": 0.0, "step": 4092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.9765625, "completions/mean_terminated_length": 229.24176025390625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.07186675816774368, "epoch": 0.32744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.814061809139852e-06, "loss": 0.0, "num_tokens": 185937617.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0710924044251442, "epoch": 0.32752, "grad_norm": 0.0, "learning_rate": 2.8136942141762106e-06, "loss": 0.0, "step": 4094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.203125, "completions/mean_terminated_length": 236.88607788085938, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.07673167064785957, "epoch": 0.3276, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8133265447652087e-06, "loss": 0.0, "num_tokens": 186034411.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07679031044244766, "epoch": 0.32768, "grad_norm": 0.0, "learning_rate": 2.81295880093258e-06, "loss": 0.0, "step": 4096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 224.140625, "completions/mean_terminated_length": 216.78846740722656, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.06965886056423187, "epoch": 0.32776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8125909827040615e-06, "loss": 0.0, "num_tokens": 186128637.0, "reward": 0.45364314317703247, "reward_std": 0.0, "rewards/reward_fn/mean": 0.45364314317703247, "rewards/reward_fn/std": 0.9880856871604919, "step": 4097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06911331787705421, "epoch": 0.32784, "grad_norm": 0.0, "learning_rate": 2.8122230901053976e-06, "loss": 0.0, "step": 4098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.9375, "completions/mean_terminated_length": 239.05262756347656, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.06596416607499123, "epoch": 0.32792, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.811855123162337e-06, "loss": 0.0, "num_tokens": 186225653.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06775682419538498, "epoch": 0.328, "grad_norm": 0.0, "learning_rate": 2.8114870819006325e-06, "loss": 0.0, "step": 4100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.0390625, "completions/mean_terminated_length": 239.242431640625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.06816786155104637, "epoch": 0.32808, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8111189663460443e-06, "loss": 0.0, "num_tokens": 186322298.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06961603835225105, "epoch": 0.32816, "grad_norm": 0.0, "learning_rate": 2.8107507765243354e-06, "loss": 0.0, "step": 4102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.3359375, "completions/mean_terminated_length": 240.59722900390625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.06916849315166473, "epoch": 0.32824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.810382512461276e-06, "loss": 0.0, "num_tokens": 186419493.0, "reward": 0.10638301074504852, "reward_std": 0.0, "rewards/reward_fn/mean": 0.10638301074504852, "rewards/reward_fn/std": 0.28256893157958984, "step": 4103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06909486278891563, "epoch": 0.32832, "grad_norm": 0.0, "learning_rate": 2.8100141741826404e-06, "loss": 0.0, "step": 4104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.984375, "completions/mean_terminated_length": 226.15533447265625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.06672153249382973, "epoch": 0.3284, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8096457617142085e-06, "loss": 0.0, "num_tokens": 186514723.0, "reward": 0.012458499521017075, "reward_std": 0.0, "rewards/reward_fn/mean": 0.012458499521017075, "rewards/reward_fn/std": 0.03309160843491554, "step": 4105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06728941574692726, "epoch": 0.32848, "grad_norm": 0.0, "learning_rate": 2.809277275081766e-06, "loss": 0.0, "step": 4106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 219.6875, "completions/mean_terminated_length": 198.61727905273438, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.07991360127925873, "epoch": 0.32856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8089087143111008e-06, "loss": 0.0, "num_tokens": 186608379.0, "reward": 0.3799973428249359, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3799973428249359, "rewards/reward_fn/std": 0.9942457675933838, "step": 4107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0779346153140068, "epoch": 0.32864, "grad_norm": 0.0, "learning_rate": 2.808540079428011e-06, "loss": 0.0, "step": 4108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.265625, "completions/mean_terminated_length": 213.24752807617188, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.06406411528587341, "epoch": 0.32872, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8081713704582954e-06, "loss": 0.0, "num_tokens": 186702365.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06400828063488007, "epoch": 0.3288, "grad_norm": 0.0, "learning_rate": 2.8078025874277607e-06, "loss": 0.0, "step": 4110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.328125, "completions/mean_terminated_length": 214.65625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.07070738449692726, "epoch": 0.32888, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.807433730362217e-06, "loss": 0.0, "num_tokens": 186798023.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07008856534957886, "epoch": 0.32896, "grad_norm": 0.0, "learning_rate": 2.807064799287481e-06, "loss": 0.0, "step": 4112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.15625, "completions/mean_terminated_length": 220.60760498046875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.07824817299842834, "epoch": 0.32904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8066957942293735e-06, "loss": 0.0, "num_tokens": 186893531.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.078705795109272, "epoch": 0.32912, "grad_norm": 0.0, "learning_rate": 2.806326715213722e-06, "loss": 0.0, "step": 4114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 211.296875, "completions/mean_terminated_length": 184.47500610351562, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.06498072296380997, "epoch": 0.3292, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8059575622663566e-06, "loss": 0.0, "num_tokens": 186986113.0, "reward": 0.022260108962655067, "reward_std": 0.0, "rewards/reward_fn/mean": 0.022260108962655067, "rewards/reward_fn/std": 0.059126123785972595, "step": 4115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06466332077980042, "epoch": 0.32928, "grad_norm": 0.0, "learning_rate": 2.8055883354131145e-06, "loss": 0.0, "step": 4116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.125, "completions/mean_terminated_length": 209.87232971191406, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.06344539485871792, "epoch": 0.32936, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.805219034679839e-06, "loss": 0.0, "num_tokens": 187080081.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06426199153065681, "epoch": 0.32944, "grad_norm": 0.0, "learning_rate": 2.8048496600923757e-06, "loss": 0.0, "step": 4118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 218.7734375, "completions/mean_terminated_length": 204.20652770996094, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.061388326808810234, "epoch": 0.32952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.804480211676577e-06, "loss": 0.0, "num_tokens": 187173620.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05804102495312691, "epoch": 0.3296, "grad_norm": 0.0, "learning_rate": 2.8041106894583025e-06, "loss": 0.0, "step": 4120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 239.3515625, "completions/mean_terminated_length": 226.4027862548828, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.07657384499907494, "epoch": 0.32968, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.803741093463412e-06, "loss": 0.0, "num_tokens": 187269793.0, "reward": 0.07864314317703247, "reward_std": 0.0, "rewards/reward_fn/mean": 0.07864314317703247, "rewards/reward_fn/std": 0.20888777077198029, "step": 4121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07514046877622604, "epoch": 0.32976, "grad_norm": 0.0, "learning_rate": 2.8033714237177748e-06, "loss": 0.0, "step": 4122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.9453125, "completions/mean_terminated_length": 238.8400115966797, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.07275056466460228, "epoch": 0.32984, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.803001680247264e-06, "loss": 0.0, "num_tokens": 187366810.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07203947380185127, "epoch": 0.32992, "grad_norm": 0.0, "learning_rate": 2.802631863077757e-06, "loss": 0.0, "step": 4124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.2109375, "completions/mean_terminated_length": 239.3833465576172, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.07032693922519684, "epoch": 0.33, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.802261972235137e-06, "loss": 0.0, "num_tokens": 187464117.0, "reward": 0.002499666763469577, "reward_std": 0.0, "rewards/reward_fn/mean": 0.002499666763469577, "rewards/reward_fn/std": 0.006639483384788036, "step": 4125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07198380306363106, "epoch": 0.33008, "grad_norm": 0.0, "learning_rate": 2.801892007745294e-06, "loss": 0.0, "step": 4126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 235.828125, "completions/mean_terminated_length": 220.63014221191406, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.06709983199834824, "epoch": 0.33016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.8015219696341204e-06, "loss": 0.0, "num_tokens": 187559839.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06815049424767494, "epoch": 0.33024, "grad_norm": 0.0, "learning_rate": 2.8011518579275143e-06, "loss": 0.0, "step": 4128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.90625, "completions/mean_terminated_length": 220.4736785888672, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.06858641654253006, "epoch": 0.33032, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.800781672651381e-06, "loss": 0.0, "num_tokens": 187655443.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06597167998552322, "epoch": 0.3304, "grad_norm": 0.0, "learning_rate": 2.800411413831629e-06, "loss": 0.0, "step": 4130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.1484375, "completions/mean_terminated_length": 235.26864624023438, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.07763437926769257, "epoch": 0.33048, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.800041081494172e-06, "loss": 0.0, "num_tokens": 187752358.0, "reward": 0.09723600745201111, "reward_std": 0.0, "rewards/reward_fn/mean": 0.09723600745201111, "rewards/reward_fn/std": 0.2582731544971466, "step": 4131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07435357570648193, "epoch": 0.33056, "grad_norm": 0.0, "learning_rate": 2.7996706756649303e-06, "loss": 0.0, "step": 4132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 231.2890625, "completions/mean_terminated_length": 223.7244873046875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.06955625116825104, "epoch": 0.33064, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7993001963698276e-06, "loss": 0.0, "num_tokens": 187847499.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07131002098321915, "epoch": 0.33072, "grad_norm": 0.0, "learning_rate": 2.798929643634794e-06, "loss": 0.0, "step": 4134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.4296875, "completions/mean_terminated_length": 217.09573364257812, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.06816637516021729, "epoch": 0.3308, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.798559017485764e-06, "loss": 0.0, "num_tokens": 187942146.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06790023669600487, "epoch": 0.33088, "grad_norm": 0.0, "learning_rate": 2.7981883179486778e-06, "loss": 0.0, "step": 4136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.8671875, "completions/mean_terminated_length": 238.6700897216797, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.06795407086610794, "epoch": 0.33096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7978175450494804e-06, "loss": 0.0, "num_tokens": 188038769.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07106026634573936, "epoch": 0.33104, "grad_norm": 0.0, "learning_rate": 2.7974466988141214e-06, "loss": 0.0, "step": 4138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5078125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.6640625, "completions/mean_terminated_length": 230.9365234375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.07017554342746735, "epoch": 0.33112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.797075779268557e-06, "loss": 0.0, "num_tokens": 188135494.0, "reward": 0.14061084389686584, "reward_std": 0.0, "rewards/reward_fn/mean": 0.14061084389686584, "rewards/reward_fn/std": 0.2460259050130844, "step": 4139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0730074793100357, "epoch": 0.3312, "grad_norm": 0.0, "learning_rate": 2.796704786438748e-06, "loss": 0.0, "step": 4140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.4609375, "completions/mean_terminated_length": 231.18667602539062, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.06643512845039368, "epoch": 0.33128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7963337203506582e-06, "loss": 0.0, "num_tokens": 188231937.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06579846143722534, "epoch": 0.33136, "grad_norm": 0.0, "learning_rate": 2.795962581030259e-06, "loss": 0.0, "step": 4142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.4140625, "completions/mean_terminated_length": 220.6901397705078, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.06662314757704735, "epoch": 0.33144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7955913685035276e-06, "loss": 0.0, "num_tokens": 188327734.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06682392954826355, "epoch": 0.33152, "grad_norm": 0.0, "learning_rate": 2.7952200827964436e-06, "loss": 0.0, "step": 4144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 234.0078125, "completions/mean_terminated_length": 199.6999969482422, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.06883150339126587, "epoch": 0.3316, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7948487239349932e-06, "loss": 0.0, "num_tokens": 188423223.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06697703152894974, "epoch": 0.33168, "grad_norm": 0.0, "learning_rate": 2.794477291945168e-06, "loss": 0.0, "step": 4146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 238.7734375, "completions/mean_terminated_length": 225.375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.06713908910751343, "epoch": 0.33176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7941057868529644e-06, "loss": 0.0, "num_tokens": 188519322.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0709758885204792, "epoch": 0.33184, "grad_norm": 0.0, "learning_rate": 2.7937342086843836e-06, "loss": 0.0, "step": 4148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 211.71875, "completions/mean_terminated_length": 195.0537567138672, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.0694965124130249, "epoch": 0.33192, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7933625574654317e-06, "loss": 0.0, "num_tokens": 188611958.0, "reward": 0.4136883616447449, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4136883616447449, "rewards/reward_fn/std": 0.9866312742233276, "step": 4149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07537424936890602, "epoch": 0.332, "grad_norm": 0.0, "learning_rate": 2.7929908332221207e-06, "loss": 0.0, "step": 4150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 219.6953125, "completions/mean_terminated_length": 201.96511840820312, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.0731603130698204, "epoch": 0.33208, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7926190359804676e-06, "loss": 0.0, "num_tokens": 188705615.0, "reward": 0.40443697571754456, "reward_std": 0.0, "rewards/reward_fn/mean": 0.40443697571754456, "rewards/reward_fn/std": 0.9879209995269775, "step": 4151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06625128537416458, "epoch": 0.33216, "grad_norm": 0.0, "learning_rate": 2.7922471657664937e-06, "loss": 0.0, "step": 4152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6015625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 229.703125, "completions/mean_terminated_length": 190.0, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.06471218541264534, "epoch": 0.33224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.791875222606227e-06, "loss": 0.0, "num_tokens": 188800553.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06648541614413261, "epoch": 0.33232, "grad_norm": 0.0, "learning_rate": 2.791503206525699e-06, "loss": 0.0, "step": 4154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.359375, "completions/mean_terminated_length": 211.21212768554688, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.06531057134270668, "epoch": 0.3324, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7911311175509466e-06, "loss": 0.0, "num_tokens": 188894423.0, "reward": 0.02467191591858864, "reward_std": 0.0, "rewards/reward_fn/mean": 0.02467191591858864, "rewards/reward_fn/std": 0.06553223729133606, "step": 4155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06521843001246452, "epoch": 0.33248, "grad_norm": 0.0, "learning_rate": 2.7907589557080123e-06, "loss": 0.0, "step": 4156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.5859375, "completions/mean_terminated_length": 225.41836547851562, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.07068977132439613, "epoch": 0.33256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7903867210229444e-06, "loss": 0.0, "num_tokens": 188989730.0, "reward": 0.12493911385536194, "reward_std": 0.0, "rewards/reward_fn/mean": 0.12493911385536194, "rewards/reward_fn/std": 0.33185669779777527, "step": 4157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07411909848451614, "epoch": 0.33264, "grad_norm": 0.0, "learning_rate": 2.7900144135217935e-06, "loss": 0.0, "step": 4158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.65625, "completions/mean_terminated_length": 204.2133331298828, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.0694592297077179, "epoch": 0.33272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.789642033230619e-06, "loss": 0.0, "num_tokens": 189084150.0, "reward": 0.002499666763469577, "reward_std": 0.0, "rewards/reward_fn/mean": 0.002499666763469577, "rewards/reward_fn/std": 0.006639483384788036, "step": 4159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07013896852731705, "epoch": 0.3328, "grad_norm": 0.0, "learning_rate": 2.7892695801754823e-06, "loss": 0.0, "step": 4160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.9140625, "completions/mean_terminated_length": 216.18309020996094, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.06102572754025459, "epoch": 0.33288, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.788897054382452e-06, "loss": 0.0, "num_tokens": 189179627.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06051500514149666, "epoch": 0.33296, "grad_norm": 0.0, "learning_rate": 2.7885244558776015e-06, "loss": 0.0, "step": 4162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 196.7890625, "completions/mean_terminated_length": 182.41748046875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.061011943966150284, "epoch": 0.33304, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.788151784687007e-06, "loss": 0.0, "num_tokens": 189270352.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.054783571511507034, "epoch": 0.33312, "grad_norm": 0.0, "learning_rate": 2.7877790408367525e-06, "loss": 0.0, "step": 4164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 205.390625, "completions/mean_terminated_length": 183.21348571777344, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.0657980665564537, "epoch": 0.3332, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7874062243529268e-06, "loss": 0.0, "num_tokens": 189362178.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06777861341834068, "epoch": 0.33328, "grad_norm": 0.0, "learning_rate": 2.787033335261622e-06, "loss": 0.0, "step": 4166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.078125, "completions/mean_terminated_length": 233.4430389404297, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.07190980017185211, "epoch": 0.33336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.786660373588937e-06, "loss": 0.0, "num_tokens": 189458700.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07072125747799873, "epoch": 0.33344, "grad_norm": 0.0, "learning_rate": 2.786287339360975e-06, "loss": 0.0, "step": 4168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.8984375, "completions/mean_terminated_length": 221.07408142089844, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.07440628483891487, "epoch": 0.33352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.785914232603845e-06, "loss": 0.0, "num_tokens": 189554175.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0750110074877739, "epoch": 0.3336, "grad_norm": 0.0, "learning_rate": 2.7855410533436595e-06, "loss": 0.0, "step": 4170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.8984375, "completions/mean_terminated_length": 239.20779418945312, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.07045421749353409, "epoch": 0.33368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.785167801606538e-06, "loss": 0.0, "num_tokens": 189651186.0, "reward": 0.004997334908694029, "reward_std": 0.0, "rewards/reward_fn/mean": 0.004997334908694029, "rewards/reward_fn/std": 0.013273656368255615, "step": 4171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0697050541639328, "epoch": 0.33376, "grad_norm": 0.0, "learning_rate": 2.7847944774186034e-06, "loss": 0.0, "step": 4172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.3671875, "completions/mean_terminated_length": 207.9873504638672, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.06894350051879883, "epoch": 0.33384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7844210808059847e-06, "loss": 0.0, "num_tokens": 189745697.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0722530260682106, "epoch": 0.33392, "grad_norm": 0.0, "learning_rate": 2.7840476117948163e-06, "loss": 0.0, "step": 4174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 232.6015625, "completions/mean_terminated_length": 223.44566345214844, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.07213830575346947, "epoch": 0.334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.783674070411237e-06, "loss": 0.0, "num_tokens": 189841006.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07226702198386192, "epoch": 0.33408, "grad_norm": 0.0, "learning_rate": 2.7833004566813898e-06, "loss": 0.0, "step": 4176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.2578125, "completions/mean_terminated_length": 217.876708984375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.06771597638726234, "epoch": 0.33416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7829267706314245e-06, "loss": 0.0, "num_tokens": 189936527.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07030845433473587, "epoch": 0.33424, "grad_norm": 0.0, "learning_rate": 2.7825530122874953e-06, "loss": 0.0, "step": 4178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 238.4140625, "completions/mean_terminated_length": 224.2957763671875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.0731566995382309, "epoch": 0.33432, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7821791816757608e-06, "loss": 0.0, "num_tokens": 190032580.0, "reward": 1.1767055988311768, "reward_std": 0.0, "rewards/reward_fn/mean": 1.1767055988311768, "rewards/reward_fn/std": 1.4239355325698853, "step": 4179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06714170053601265, "epoch": 0.3344, "grad_norm": 0.0, "learning_rate": 2.7818052788223856e-06, "loss": 0.0, "step": 4180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.7265625, "completions/mean_terminated_length": 237.12222290039062, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.06778324395418167, "epoch": 0.33448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7814313037535395e-06, "loss": 0.0, "num_tokens": 190129185.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06616028025746346, "epoch": 0.33456, "grad_norm": 0.0, "learning_rate": 2.7810572564953947e-06, "loss": 0.0, "step": 4182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.46875, "completions/mean_terminated_length": 237.7777862548828, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.06940637156367302, "epoch": 0.33464, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7806831370741328e-06, "loss": 0.0, "num_tokens": 190226013.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06896475702524185, "epoch": 0.33472, "grad_norm": 0.0, "learning_rate": 2.780308945515937e-06, "loss": 0.0, "step": 4184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 240.1953125, "completions/mean_terminated_length": 229.02667236328125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.06514576077461243, "epoch": 0.3348, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7799346818469976e-06, "loss": 0.0, "num_tokens": 190322294.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07116060331463814, "epoch": 0.33488, "grad_norm": 0.0, "learning_rate": 2.779560346093508e-06, "loss": 0.0, "step": 4186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 216.9140625, "completions/mean_terminated_length": 206.46534729003906, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.07092805951833725, "epoch": 0.33496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.779185938281669e-06, "loss": 0.0, "num_tokens": 190415595.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06711054220795631, "epoch": 0.33504, "grad_norm": 0.0, "learning_rate": 2.7788114584376846e-06, "loss": 0.0, "step": 4188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 212.8359375, "completions/mean_terminated_length": 196.59140014648438, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.07371346652507782, "epoch": 0.33512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.778436906587764e-06, "loss": 0.0, "num_tokens": 190508374.0, "reward": 0.4732079803943634, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4732079803943634, "rewards/reward_fn/std": 0.992942750453949, "step": 4189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07066784426569939, "epoch": 0.3352, "grad_norm": 0.0, "learning_rate": 2.7780622827581228e-06, "loss": 0.0, "step": 4190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.453125, "completions/mean_terminated_length": 217.7096710205078, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.07074380666017532, "epoch": 0.33528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.777687586974979e-06, "loss": 0.0, "num_tokens": 190604304.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07171324267983437, "epoch": 0.33536, "grad_norm": 0.0, "learning_rate": 2.77731281926456e-06, "loss": 0.0, "step": 4192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.625, "completions/mean_terminated_length": 204.84849548339844, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.06853966042399406, "epoch": 0.33544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7769379796530936e-06, "loss": 0.0, "num_tokens": 190699232.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06596488133072853, "epoch": 0.33552, "grad_norm": 0.0, "learning_rate": 2.776563068166815e-06, "loss": 0.0, "step": 4194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.9375, "completions/mean_terminated_length": 226.13954162597656, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.06639905646443367, "epoch": 0.3356, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.776188084831964e-06, "loss": 0.0, "num_tokens": 190794968.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06483078002929688, "epoch": 0.33568, "grad_norm": 0.0, "learning_rate": 2.775813029674786e-06, "loss": 0.0, "step": 4196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.9140625, "completions/mean_terminated_length": 215.53260803222656, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.07462754473090172, "epoch": 0.33576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7754379027215304e-06, "loss": 0.0, "num_tokens": 190889549.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07060029730200768, "epoch": 0.33584, "grad_norm": 0.0, "learning_rate": 2.7750627039984526e-06, "loss": 0.0, "step": 4198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 249.4453125, "completions/mean_terminated_length": 237.35556030273438, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.07098780199885368, "epoch": 0.33592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.774687433531812e-06, "loss": 0.0, "num_tokens": 190987014.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07292792573571205, "epoch": 0.336, "grad_norm": 0.0, "learning_rate": 2.7743120913478737e-06, "loss": 0.0, "step": 4200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.8359375, "completions/mean_terminated_length": 224.4387664794922, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.07146425917744637, "epoch": 0.33608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.773936677472908e-06, "loss": 0.0, "num_tokens": 191082225.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0719725638628006, "epoch": 0.33616, "grad_norm": 0.0, "learning_rate": 2.7735611919331893e-06, "loss": 0.0, "step": 4202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.015625, "completions/mean_terminated_length": 239.1428680419922, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.07072222977876663, "epoch": 0.33624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7731856347549983e-06, "loss": 0.0, "num_tokens": 191178995.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06721284240484238, "epoch": 0.33632, "grad_norm": 0.0, "learning_rate": 2.77281000596462e-06, "loss": 0.0, "step": 4204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 197.765625, "completions/mean_terminated_length": 180.7070770263672, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.06670292094349861, "epoch": 0.3364, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7724343055883437e-06, "loss": 0.0, "num_tokens": 191269845.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06166016310453415, "epoch": 0.33648, "grad_norm": 0.0, "learning_rate": 2.772058533652465e-06, "loss": 0.0, "step": 4206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.3046875, "completions/mean_terminated_length": 235.6875, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.07025149837136269, "epoch": 0.33656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.771682690183284e-06, "loss": 0.0, "num_tokens": 191366524.0, "reward": 0.08953723311424255, "reward_std": 0.0, "rewards/reward_fn/mean": 0.08953723311424255, "rewards/reward_fn/std": 0.23782405257225037, "step": 4207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07115912437438965, "epoch": 0.33664, "grad_norm": 0.0, "learning_rate": 2.771306775207106e-06, "loss": 0.0, "step": 4208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.5, "completions/mean_terminated_length": 228.86956787109375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.06773993372917175, "epoch": 0.33672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7709307887502403e-06, "loss": 0.0, "num_tokens": 191462332.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06975771486759186, "epoch": 0.3368, "grad_norm": 0.0, "learning_rate": 2.7705547308390028e-06, "loss": 0.0, "step": 4210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5078125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 212.640625, "completions/mean_terminated_length": 167.90476989746094, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.07420850545167923, "epoch": 0.33688, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7701786014997134e-06, "loss": 0.0, "num_tokens": 191555086.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0683513842523098, "epoch": 0.33696, "grad_norm": 0.0, "learning_rate": 2.7698024007586967e-06, "loss": 0.0, "step": 4212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 215.5390625, "completions/mean_terminated_length": 203.15306091308594, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.07589738070964813, "epoch": 0.33704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.769426128642283e-06, "loss": 0.0, "num_tokens": 191648211.0, "reward": 0.3923865556716919, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3923865556716919, "rewards/reward_fn/std": 0.9905130863189697, "step": 4213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.076754841953516, "epoch": 0.33712, "grad_norm": 0.0, "learning_rate": 2.769049785176808e-06, "loss": 0.0, "step": 4214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.328125, "completions/mean_terminated_length": 226.5, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.06819117441773415, "epoch": 0.3372, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7686733703886103e-06, "loss": 0.0, "num_tokens": 191744509.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06865009665489197, "epoch": 0.33728, "grad_norm": 0.0, "learning_rate": 2.768296884304037e-06, "loss": 0.0, "step": 4216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.421875, "completions/mean_terminated_length": 229.57778930664062, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.06919058039784431, "epoch": 0.33736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7679203269494363e-06, "loss": 0.0, "num_tokens": 191840435.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07088273763656616, "epoch": 0.33744, "grad_norm": 0.0, "learning_rate": 2.767543698351164e-06, "loss": 0.0, "step": 4218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 201.84375, "completions/mean_terminated_length": 175.39535522460938, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.0816427432000637, "epoch": 0.33752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.76716699853558e-06, "loss": 0.0, "num_tokens": 191931807.0, "reward": 1.1274996995925903, "reward_std": 0.0, "rewards/reward_fn/mean": 1.1274996995925903, "rewards/reward_fn/std": 1.4561455249786377, "step": 4219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07298748195171356, "epoch": 0.3376, "grad_norm": 0.0, "learning_rate": 2.76679022752905e-06, "loss": 0.0, "step": 4220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.8359375, "completions/mean_terminated_length": 215.30262756347656, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.06407587975263596, "epoch": 0.33768, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7664133853579435e-06, "loss": 0.0, "num_tokens": 192027018.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06614717096090317, "epoch": 0.33776, "grad_norm": 0.0, "learning_rate": 2.766036472048635e-06, "loss": 0.0, "step": 4222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 206.171875, "completions/mean_terminated_length": 178.21951293945312, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06817609071731567, "epoch": 0.33784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7656594876275046e-06, "loss": 0.0, "num_tokens": 192118944.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07533273473381996, "epoch": 0.33792, "grad_norm": 0.0, "learning_rate": 2.765282432120938e-06, "loss": 0.0, "step": 4224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.0390625, "completions/mean_terminated_length": 229.76712036132812, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.06822032108902931, "epoch": 0.338, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.764905305555324e-06, "loss": 0.0, "num_tokens": 192215333.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07073148339986801, "epoch": 0.33808, "grad_norm": 0.0, "learning_rate": 2.764528107957058e-06, "loss": 0.0, "step": 4226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.6015625, "completions/mean_terminated_length": 222.14515686035156, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.06839988380670547, "epoch": 0.33816, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7641508393525404e-06, "loss": 0.0, "num_tokens": 192311538.0, "reward": 0.49987494945526123, "reward_std": 0.0, "rewards/reward_fn/mean": 0.49987494945526123, "rewards/reward_fn/std": 1.0038665533065796, "step": 4227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0658499002456665, "epoch": 0.33824, "grad_norm": 0.0, "learning_rate": 2.7637734997681753e-06, "loss": 0.0, "step": 4228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.15625, "completions/mean_terminated_length": 229.4945068359375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.06587791070342064, "epoch": 0.33832, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.763396089230373e-06, "loss": 0.0, "num_tokens": 192407430.0, "reward": 0.03411313518881798, "reward_std": 0.0, "rewards/reward_fn/mean": 0.03411313518881798, "rewards/reward_fn/std": 0.09060950577259064, "step": 4229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07246619090437889, "epoch": 0.3384, "grad_norm": 0.0, "learning_rate": 2.763018607765547e-06, "loss": 0.0, "step": 4230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 227.8984375, "completions/mean_terminated_length": 208.67105102539062, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.07979391515254974, "epoch": 0.33848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.762641055400119e-06, "loss": 0.0, "num_tokens": 192502137.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07758184894919395, "epoch": 0.33856, "grad_norm": 0.0, "learning_rate": 2.762263432160512e-06, "loss": 0.0, "step": 4232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 224.1953125, "completions/mean_terminated_length": 200.23287963867188, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.06548179686069489, "epoch": 0.33864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7618857380731565e-06, "loss": 0.0, "num_tokens": 192596370.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06370648741722107, "epoch": 0.33872, "grad_norm": 0.0, "learning_rate": 2.761507973164487e-06, "loss": 0.0, "step": 4234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.46875, "completions/mean_terminated_length": 222.7857208251953, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.0732322484254837, "epoch": 0.3388, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7611301374609423e-06, "loss": 0.0, "num_tokens": 192692814.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07064279168844223, "epoch": 0.33888, "grad_norm": 0.0, "learning_rate": 2.760752230988968e-06, "loss": 0.0, "step": 4236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 226.3828125, "completions/mean_terminated_length": 209.76828002929688, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.07794153690338135, "epoch": 0.33896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7603742537750135e-06, "loss": 0.0, "num_tokens": 192787327.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0777505449950695, "epoch": 0.33904, "grad_norm": 0.0, "learning_rate": 2.759996205845532e-06, "loss": 0.0, "step": 4238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5234375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 248.3515625, "completions/mean_terminated_length": 239.9508056640625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.06555246189236641, "epoch": 0.33912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.759618087226984e-06, "loss": 0.0, "num_tokens": 192884652.0, "reward": 0.09303461015224457, "reward_std": 0.0, "rewards/reward_fn/mean": 0.09303461015224457, "rewards/reward_fn/std": 0.24711361527442932, "step": 4239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06456587836146355, "epoch": 0.3392, "grad_norm": 0.0, "learning_rate": 2.759239897945834e-06, "loss": 0.0, "step": 4240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.5234375, "completions/mean_terminated_length": 222.31082153320312, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.06760422885417938, "epoch": 0.33928, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7588616380285504e-06, "loss": 0.0, "num_tokens": 192980463.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06857307255268097, "epoch": 0.33936, "grad_norm": 0.0, "learning_rate": 2.7584833075016075e-06, "loss": 0.0, "step": 4242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.2890625, "completions/mean_terminated_length": 228.57691955566406, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.06459684297442436, "epoch": 0.33944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7581049063914848e-06, "loss": 0.0, "num_tokens": 193076628.0, "reward": 0.08830241858959198, "reward_std": 0.0, "rewards/reward_fn/mean": 0.08830241858959198, "rewards/reward_fn/std": 0.23454421758651733, "step": 4243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06425714492797852, "epoch": 0.33952, "grad_norm": 0.0, "learning_rate": 2.757726434724666e-06, "loss": 0.0, "step": 4244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 237.15625, "completions/mean_terminated_length": 230.61053466796875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.06662382185459137, "epoch": 0.3396, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7573478925276405e-06, "loss": 0.0, "num_tokens": 193172520.0, "reward": 0.4070621132850647, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4070621132850647, "rewards/reward_fn/std": 0.9863881468772888, "step": 4245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06913075223565102, "epoch": 0.33968, "grad_norm": 0.0, "learning_rate": 2.756969279826902e-06, "loss": 0.0, "step": 4246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.7421875, "completions/mean_terminated_length": 238.672119140625, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.06702916696667671, "epoch": 0.33976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.75659059664895e-06, "loss": 0.0, "num_tokens": 193269767.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07165785878896713, "epoch": 0.33984, "grad_norm": 0.0, "learning_rate": 2.7562118430202875e-06, "loss": 0.0, "step": 4248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.984375, "completions/mean_terminated_length": 239.17172241210938, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.07321463897824287, "epoch": 0.33992, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7558330189674235e-06, "loss": 0.0, "num_tokens": 193366405.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07393508404493332, "epoch": 0.34, "grad_norm": 0.0, "learning_rate": 2.755454124516872e-06, "loss": 0.0, "step": 4250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 225.6953125, "completions/mean_terminated_length": 215.16842651367188, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.06300529837608337, "epoch": 0.34008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7550751596951514e-06, "loss": 0.0, "num_tokens": 193460830.0, "reward": 0.4998645484447479, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4998645484447479, "rewards/reward_fn/std": 1.0038613080978394, "step": 4251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06497317925095558, "epoch": 0.34016, "grad_norm": 0.0, "learning_rate": 2.7546961245287853e-06, "loss": 0.0, "step": 4252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 236.171875, "completions/mean_terminated_length": 227.1591033935547, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.06571976467967033, "epoch": 0.34024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7543170190443016e-06, "loss": 0.0, "num_tokens": 193556596.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06452013924717903, "epoch": 0.34032, "grad_norm": 0.0, "learning_rate": 2.7539378432682348e-06, "loss": 0.0, "step": 4254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 240.5546875, "completions/mean_terminated_length": 229.98684692382812, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.07095807045698166, "epoch": 0.3404, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.753558597227122e-06, "loss": 0.0, "num_tokens": 193652923.0, "reward": 0.04533843323588371, "reward_std": 0.0, "rewards/reward_fn/mean": 0.04533843323588371, "rewards/reward_fn/std": 0.12042555958032608, "step": 4255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07212630659341812, "epoch": 0.34048, "grad_norm": 0.0, "learning_rate": 2.753179280947507e-06, "loss": 0.0, "step": 4256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 243.3671875, "completions/mean_terminated_length": 234.7236785888672, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.06339018233120441, "epoch": 0.34056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7527998944559383e-06, "loss": 0.0, "num_tokens": 193749610.0, "reward": 0.10495676845312119, "reward_std": 0.0, "rewards/reward_fn/mean": 0.10495676845312119, "rewards/reward_fn/std": 0.27878063917160034, "step": 4257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06426341831684113, "epoch": 0.34064, "grad_norm": 0.0, "learning_rate": 2.752420437778969e-06, "loss": 0.0, "step": 4258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.4375, "completions/mean_terminated_length": 238.5142822265625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.0694127306342125, "epoch": 0.34072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7520409109431556e-06, "loss": 0.0, "num_tokens": 193846690.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07132020592689514, "epoch": 0.3408, "grad_norm": 0.0, "learning_rate": 2.7516613139750624e-06, "loss": 0.0, "step": 4260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.6640625, "completions/mean_terminated_length": 231.53334045410156, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.06530240178108215, "epoch": 0.34088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.751281646901257e-06, "loss": 0.0, "num_tokens": 193943159.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06683750450611115, "epoch": 0.34096, "grad_norm": 0.0, "learning_rate": 2.7509019097483115e-06, "loss": 0.0, "step": 4262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.8125, "completions/mean_terminated_length": 201.478271484375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.07521982491016388, "epoch": 0.34104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.750522102542804e-06, "loss": 0.0, "num_tokens": 194036447.0, "reward": 0.3874585032463074, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3874585032463074, "rewards/reward_fn/std": 0.9918686747550964, "step": 4263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07499229907989502, "epoch": 0.34112, "grad_norm": 0.0, "learning_rate": 2.7501422253113167e-06, "loss": 0.0, "step": 4264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.90625, "completions/mean_terminated_length": 232.89552307128906, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.06933672353625298, "epoch": 0.3412, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7497622780804373e-06, "loss": 0.0, "num_tokens": 194133203.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07086123898625374, "epoch": 0.34128, "grad_norm": 0.0, "learning_rate": 2.7493822608767574e-06, "loss": 0.0, "step": 4266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 225.546875, "completions/mean_terminated_length": 211.19540405273438, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.07251500710844994, "epoch": 0.34136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7490021737268754e-06, "loss": 0.0, "num_tokens": 194227609.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0722234845161438, "epoch": 0.34144, "grad_norm": 0.0, "learning_rate": 2.748622016657392e-06, "loss": 0.0, "step": 4268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.921875, "completions/mean_terminated_length": 223.46835327148438, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.06259972788393497, "epoch": 0.34152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7482417896949157e-06, "loss": 0.0, "num_tokens": 194323343.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06380306743085384, "epoch": 0.3416, "grad_norm": 0.0, "learning_rate": 2.747861492866057e-06, "loss": 0.0, "step": 4270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.34375, "completions/mean_terminated_length": 240.82191467285156, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.06850704178214073, "epoch": 0.34168, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.747481126197433e-06, "loss": 0.0, "num_tokens": 194420539.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06944174692034721, "epoch": 0.34176, "grad_norm": 0.0, "learning_rate": 2.7471006897156664e-06, "loss": 0.0, "step": 4272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.6953125, "completions/mean_terminated_length": 228.5394744873047, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.07819965109229088, "epoch": 0.34184, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.746720183447382e-06, "loss": 0.0, "num_tokens": 194516756.0, "reward": 0.49930325150489807, "reward_std": 0.0, "rewards/reward_fn/mean": 0.49930325150489807, "rewards/reward_fn/std": 1.003581166267395, "step": 4273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07866666093468666, "epoch": 0.34192, "grad_norm": 0.0, "learning_rate": 2.7463396074192128e-06, "loss": 0.0, "step": 4274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 214.3984375, "completions/mean_terminated_length": 191.06097412109375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.0753963515162468, "epoch": 0.342, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7459589616577945e-06, "loss": 0.0, "num_tokens": 194609735.0, "reward": 0.4978767931461334, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4978767931461334, "rewards/reward_fn/std": 1.0028787851333618, "step": 4275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08088639751076698, "epoch": 0.34208, "grad_norm": 0.0, "learning_rate": 2.7455782461897672e-06, "loss": 0.0, "step": 4276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.40625, "completions/mean_terminated_length": 230.9199981689453, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.0700450986623764, "epoch": 0.34216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.745197461041779e-06, "loss": 0.0, "num_tokens": 194705531.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07090498507022858, "epoch": 0.34224, "grad_norm": 0.0, "learning_rate": 2.744816606240479e-06, "loss": 0.0, "step": 4278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.3203125, "completions/mean_terminated_length": 224.09091186523438, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.07577742636203766, "epoch": 0.34232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7444356818125245e-06, "loss": 0.0, "num_tokens": 194800676.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07495976239442825, "epoch": 0.3424, "grad_norm": 0.0, "learning_rate": 2.744054687784575e-06, "loss": 0.0, "step": 4280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.8671875, "completions/mean_terminated_length": 207.25758361816406, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.06974506378173828, "epoch": 0.34248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.743673624183297e-06, "loss": 0.0, "num_tokens": 194895763.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06754709407687187, "epoch": 0.34256, "grad_norm": 0.0, "learning_rate": 2.7432924910353603e-06, "loss": 0.0, "step": 4282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.34375, "completions/mean_terminated_length": 237.59091186523438, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.07042178139090538, "epoch": 0.34264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7429112883674407e-06, "loss": 0.0, "num_tokens": 194992447.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06972788646817207, "epoch": 0.34272, "grad_norm": 0.0, "learning_rate": 2.7425300162062174e-06, "loss": 0.0, "step": 4284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 226.6953125, "completions/mean_terminated_length": 212.8850555419922, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.0767856016755104, "epoch": 0.3428, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7421486745783763e-06, "loss": 0.0, "num_tokens": 195087000.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0795016959309578, "epoch": 0.34288, "grad_norm": 0.0, "learning_rate": 2.741767263510607e-06, "loss": 0.0, "step": 4286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 219.53125, "completions/mean_terminated_length": 202.9545440673828, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.071126788854599, "epoch": 0.34296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.741385783029604e-06, "loss": 0.0, "num_tokens": 195180636.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07281189784407616, "epoch": 0.34304, "grad_norm": 0.0, "learning_rate": 2.7410042331620676e-06, "loss": 0.0, "step": 4288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.6953125, "completions/mean_terminated_length": 214.04615783691406, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.07607756927609444, "epoch": 0.34312, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7406226139347024e-06, "loss": 0.0, "num_tokens": 195276213.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07761119678616524, "epoch": 0.3432, "grad_norm": 0.0, "learning_rate": 2.7402409253742163e-06, "loss": 0.0, "step": 4290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 226.6328125, "completions/mean_terminated_length": 210.1585235595703, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.06519083678722382, "epoch": 0.34328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7398591675073245e-06, "loss": 0.0, "num_tokens": 195370758.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06572434678673744, "epoch": 0.34336, "grad_norm": 0.0, "learning_rate": 2.739477340360746e-06, "loss": 0.0, "step": 4292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.5078125, "completions/mean_terminated_length": 220.27272033691406, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.07130001485347748, "epoch": 0.34344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7390954439612048e-06, "loss": 0.0, "num_tokens": 195466311.0, "reward": 0.5463370084762573, "reward_std": 0.0, "rewards/reward_fn/mean": 0.5463370084762573, "rewards/reward_fn/std": 0.98359215259552, "step": 4293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07101994752883911, "epoch": 0.34352, "grad_norm": 0.0, "learning_rate": 2.738713478335429e-06, "loss": 0.0, "step": 4294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.4921875, "completions/mean_terminated_length": 216.9499969482422, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.07170747593045235, "epoch": 0.3436, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7383314435101526e-06, "loss": 0.0, "num_tokens": 195560710.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.071290522813797, "epoch": 0.34368, "grad_norm": 0.0, "learning_rate": 2.737949339512114e-06, "loss": 0.0, "step": 4296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 226.2421875, "completions/mean_terminated_length": 210.10842895507812, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.0763670802116394, "epoch": 0.34376, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.737567166368056e-06, "loss": 0.0, "num_tokens": 195655205.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08092347905039787, "epoch": 0.34384, "grad_norm": 0.0, "learning_rate": 2.7371849241047273e-06, "loss": 0.0, "step": 4298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.6875, "completions/mean_terminated_length": 217.85264587402344, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.07114676013588905, "epoch": 0.34392, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.736802612748881e-06, "loss": 0.0, "num_tokens": 195749885.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07023589313030243, "epoch": 0.344, "grad_norm": 0.0, "learning_rate": 2.7364202323272735e-06, "loss": 0.0, "step": 4300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 231.515625, "completions/mean_terminated_length": 215.2987060546875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.07605008035898209, "epoch": 0.34408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.736037782866669e-06, "loss": 0.0, "num_tokens": 195845055.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07479851320385933, "epoch": 0.34416, "grad_norm": 0.0, "learning_rate": 2.7356552643938337e-06, "loss": 0.0, "step": 4302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.1875, "completions/mean_terminated_length": 226.63414001464844, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.06965253502130508, "epoch": 0.34424, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.735272676935541e-06, "loss": 0.0, "num_tokens": 195940951.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07183913886547089, "epoch": 0.34432, "grad_norm": 0.0, "learning_rate": 2.7348900205185668e-06, "loss": 0.0, "step": 4304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.2265625, "completions/mean_terminated_length": 233.83824157714844, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.07315133512020111, "epoch": 0.3444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.734507295169694e-06, "loss": 0.0, "num_tokens": 196037748.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07253843545913696, "epoch": 0.34448, "grad_norm": 0.0, "learning_rate": 2.734124500915709e-06, "loss": 0.0, "step": 4306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.90625, "completions/mean_terminated_length": 237.78823852539062, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "entropy": 0.06738005951046944, "epoch": 0.34456, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.733741637783402e-06, "loss": 0.0, "num_tokens": 196134504.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06600885838270187, "epoch": 0.34464, "grad_norm": 0.0, "learning_rate": 2.733358705799572e-06, "loss": 0.0, "step": 4308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.59375, "completions/mean_terminated_length": 239.94029235839844, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.07232555374503136, "epoch": 0.34472, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7329757049910183e-06, "loss": 0.0, "num_tokens": 196231732.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07241057604551315, "epoch": 0.3448, "grad_norm": 0.0, "learning_rate": 2.7325926353845474e-06, "loss": 0.0, "step": 4310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.375, "completions/mean_terminated_length": 197.68421936035156, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.07349732145667076, "epoch": 0.34488, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.732209497006971e-06, "loss": 0.0, "num_tokens": 196325604.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07639320194721222, "epoch": 0.34496, "grad_norm": 0.0, "learning_rate": 2.731826289885103e-06, "loss": 0.0, "step": 4312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 248.0234375, "completions/mean_terminated_length": 240.046875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.07161150500178337, "epoch": 0.34504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7314430140457645e-06, "loss": 0.0, "num_tokens": 196422887.0, "reward": 0.4505459666252136, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4505459666252136, "rewards/reward_fn/std": 0.987565279006958, "step": 4313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07225381210446358, "epoch": 0.34512, "grad_norm": 0.0, "learning_rate": 2.7310596695157818e-06, "loss": 0.0, "step": 4314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.6953125, "completions/mean_terminated_length": 238.88043212890625, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.06917337328195572, "epoch": 0.3452, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.730676256321984e-06, "loss": 0.0, "num_tokens": 196519616.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06768230721354485, "epoch": 0.34528, "grad_norm": 0.0, "learning_rate": 2.730292774491206e-06, "loss": 0.0, "step": 4316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.3984375, "completions/mean_terminated_length": 233.0, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.06791594624519348, "epoch": 0.34536, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.729909224050288e-06, "loss": 0.0, "num_tokens": 196616563.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06985991820693016, "epoch": 0.34544, "grad_norm": 0.0, "learning_rate": 2.7295256050260733e-06, "loss": 0.0, "step": 4318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.7109375, "completions/mean_terminated_length": 242.85914611816406, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.08356162905693054, "epoch": 0.34552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7291419174454124e-06, "loss": 0.0, "num_tokens": 196713934.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07896698266267776, "epoch": 0.3456, "grad_norm": 0.0, "learning_rate": 2.728758161335159e-06, "loss": 0.0, "step": 4320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.3515625, "completions/mean_terminated_length": 216.87628173828125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.07445521652698517, "epoch": 0.34568, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7283743367221723e-06, "loss": 0.0, "num_tokens": 196808443.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07493777200579643, "epoch": 0.34576, "grad_norm": 0.0, "learning_rate": 2.7279904436333152e-06, "loss": 0.0, "step": 4322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 223.5, "completions/mean_terminated_length": 208.72727966308594, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.07103723660111427, "epoch": 0.34584, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7276064820954567e-06, "loss": 0.0, "num_tokens": 196902587.0, "reward": 0.4136883616447449, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4136883616447449, "rewards/reward_fn/std": 0.9866312742233276, "step": 4323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07609180733561516, "epoch": 0.34592, "grad_norm": 0.0, "learning_rate": 2.7272224521354703e-06, "loss": 0.0, "step": 4324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.15625, "completions/mean_terminated_length": 230.08163452148438, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.07015813514590263, "epoch": 0.346, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7268383537802325e-06, "loss": 0.0, "num_tokens": 196998351.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07037665694952011, "epoch": 0.34608, "grad_norm": 0.0, "learning_rate": 2.7264541870566284e-06, "loss": 0.0, "step": 4326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.4375, "completions/mean_terminated_length": 209.76470947265625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.059017062187194824, "epoch": 0.34616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.726069951991544e-06, "loss": 0.0, "num_tokens": 197093511.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06119959615170956, "epoch": 0.34624, "grad_norm": 0.0, "learning_rate": 2.725685648611873e-06, "loss": 0.0, "step": 4328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 211.4296875, "completions/mean_terminated_length": 191.1704559326172, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.0741799809038639, "epoch": 0.34632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7253012769445107e-06, "loss": 0.0, "num_tokens": 197186110.0, "reward": 1.1520648002624512, "reward_std": 0.0, "rewards/reward_fn/mean": 1.1520648002624512, "rewards/reward_fn/std": 1.4386719465255737, "step": 4329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07583839073777199, "epoch": 0.3464, "grad_norm": 0.0, "learning_rate": 2.724916837016361e-06, "loss": 0.0, "step": 4330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.6796875, "completions/mean_terminated_length": 237.3943634033203, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.07309253141283989, "epoch": 0.34648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.724532328854329e-06, "loss": 0.0, "num_tokens": 197283093.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0780206024646759, "epoch": 0.34656, "grad_norm": 0.0, "learning_rate": 2.7241477524853277e-06, "loss": 0.0, "step": 4332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 215.0234375, "completions/mean_terminated_length": 203.5500030517578, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.064737968146801, "epoch": 0.34664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.723763107936272e-06, "loss": 0.0, "num_tokens": 197376152.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06542214751243591, "epoch": 0.34672, "grad_norm": 0.0, "learning_rate": 2.7233783952340836e-06, "loss": 0.0, "step": 4334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 213.296875, "completions/mean_terminated_length": 190.1445770263672, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.06937035918235779, "epoch": 0.3468, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7229936144056893e-06, "loss": 0.0, "num_tokens": 197468990.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06776077300310135, "epoch": 0.34688, "grad_norm": 0.0, "learning_rate": 2.722608765478018e-06, "loss": 0.0, "step": 4336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 212.515625, "completions/mean_terminated_length": 208.01724243164062, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.07212977483868599, "epoch": 0.34696, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.722223848478006e-06, "loss": 0.0, "num_tokens": 197561728.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 4337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07356870919466019, "epoch": 0.34704, "grad_norm": 0.0, "learning_rate": 2.7218388634325933e-06, "loss": 0.0, "step": 4338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.5234375, "completions/mean_terminated_length": 243.57142639160156, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "entropy": 0.0661088153719902, "epoch": 0.34712, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7214538103687245e-06, "loss": 0.0, "num_tokens": 197659075.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0654698982834816, "epoch": 0.3472, "grad_norm": 0.0, "learning_rate": 2.7210686893133496e-06, "loss": 0.0, "step": 4340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.2265625, "completions/mean_terminated_length": 238.09091186523438, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.06819584220647812, "epoch": 0.34728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7206835002934227e-06, "loss": 0.0, "num_tokens": 197756000.0, "reward": 0.5123593807220459, "reward_std": 0.0, "rewards/reward_fn/mean": 0.5123593807220459, "rewards/reward_fn/std": 0.9918787479400635, "step": 4341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06953367590904236, "epoch": 0.34736, "grad_norm": 0.0, "learning_rate": 2.7202982433359037e-06, "loss": 0.0, "step": 4342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.921875, "completions/mean_terminated_length": 203.25001525878906, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.0677659809589386, "epoch": 0.34744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7199129184677554e-06, "loss": 0.0, "num_tokens": 197851350.0, "reward": 0.625, "reward_std": 0.0, "rewards/reward_fn/mean": 0.625, "rewards/reward_fn/std": 1.1153898239135742, "step": 4343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06690270826220512, "epoch": 0.34752, "grad_norm": 0.0, "learning_rate": 2.719527525715947e-06, "loss": 0.0, "step": 4344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.625, "completions/mean_terminated_length": 239.3333282470703, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.07084552943706512, "epoch": 0.3476, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.719142065107452e-06, "loss": 0.0, "num_tokens": 197948454.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06876695901155472, "epoch": 0.34768, "grad_norm": 0.0, "learning_rate": 2.7187565366692488e-06, "loss": 0.0, "step": 4346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 245.578125, "completions/mean_terminated_length": 233.7666778564453, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.06603609025478363, "epoch": 0.34776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7183709404283197e-06, "loss": 0.0, "num_tokens": 198045424.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06596511602401733, "epoch": 0.34784, "grad_norm": 0.0, "learning_rate": 2.717985276411653e-06, "loss": 0.0, "step": 4348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 242.2578125, "completions/mean_terminated_length": 232.5466766357422, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.07028673589229584, "epoch": 0.34792, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.717599544646241e-06, "loss": 0.0, "num_tokens": 198141969.0, "reward": 0.01492841262370348, "reward_std": 0.0, "rewards/reward_fn/mean": 0.01492841262370348, "rewards/reward_fn/std": 0.039652060717344284, "step": 4349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07237650826573372, "epoch": 0.348, "grad_norm": 0.0, "learning_rate": 2.71721374515908e-06, "loss": 0.0, "step": 4350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 213.3984375, "completions/mean_terminated_length": 181.30136108398438, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.06106545403599739, "epoch": 0.34808, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7168278779771734e-06, "loss": 0.0, "num_tokens": 198234820.0, "reward": 0.8683507442474365, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8683507442474365, "rewards/reward_fn/std": 1.2730607986450195, "step": 4351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07181442901492119, "epoch": 0.34816, "grad_norm": 0.0, "learning_rate": 2.716441943127526e-06, "loss": 0.0, "step": 4352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.3984375, "completions/mean_terminated_length": 199.5, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.07041853293776512, "epoch": 0.34824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.716055940637151e-06, "loss": 0.0, "num_tokens": 198329847.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 4353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0703485943377018, "epoch": 0.34832, "grad_norm": 0.0, "learning_rate": 2.715669870533063e-06, "loss": 0.0, "step": 4354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.75, "completions/mean_terminated_length": 225.70211791992188, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.06917368620634079, "epoch": 0.3484, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.715283732842284e-06, "loss": 0.0, "num_tokens": 198425303.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07299737632274628, "epoch": 0.34848, "grad_norm": 0.0, "learning_rate": 2.714897527591838e-06, "loss": 0.0, "step": 4356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5546875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.8671875, "completions/mean_terminated_length": 235.49122619628906, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.07053486257791519, "epoch": 0.34856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7145112548087576e-06, "loss": 0.0, "num_tokens": 198522438.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06686294078826904, "epoch": 0.34864, "grad_norm": 0.0, "learning_rate": 2.7141249145200756e-06, "loss": 0.0, "step": 4358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 229.125, "completions/mean_terminated_length": 220.1666717529297, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.06860216334462166, "epoch": 0.34872, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.713738506752833e-06, "loss": 0.0, "num_tokens": 198617302.0, "reward": 0.06349717825651169, "reward_std": 0.0, "rewards/reward_fn/mean": 0.06349717825651169, "rewards/reward_fn/std": 0.16865785419940948, "step": 4359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.069373220205307, "epoch": 0.3488, "grad_norm": 0.0, "learning_rate": 2.7133520315340745e-06, "loss": 0.0, "step": 4360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.3828125, "completions/mean_terminated_length": 238.159423828125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.06754355505108833, "epoch": 0.34888, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7129654888908477e-06, "loss": 0.0, "num_tokens": 198714375.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06815873086452484, "epoch": 0.34896, "grad_norm": 0.0, "learning_rate": 2.712578878850208e-06, "loss": 0.0, "step": 4362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.203125, "completions/mean_terminated_length": 236.40625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.06900188326835632, "epoch": 0.34904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7121922014392137e-06, "loss": 0.0, "num_tokens": 198811425.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0674760453402996, "epoch": 0.34912, "grad_norm": 0.0, "learning_rate": 2.7118054566849277e-06, "loss": 0.0, "step": 4364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.578125, "completions/mean_terminated_length": 209.8157958984375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.06948860734701157, "epoch": 0.3492, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7114186446144185e-06, "loss": 0.0, "num_tokens": 198906219.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07062838599085808, "epoch": 0.34928, "grad_norm": 0.0, "learning_rate": 2.7110317652547586e-06, "loss": 0.0, "step": 4366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.1796875, "completions/mean_terminated_length": 232.5978240966797, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.07261603325605392, "epoch": 0.34936, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.710644818633025e-06, "loss": 0.0, "num_tokens": 199002370.0, "reward": 0.05776464566588402, "reward_std": 0.0, "rewards/reward_fn/mean": 0.05776464566588402, "rewards/reward_fn/std": 0.15343140065670013, "step": 4367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06968609243631363, "epoch": 0.34944, "grad_norm": 0.0, "learning_rate": 2.7102578047763014e-06, "loss": 0.0, "step": 4368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 243.484375, "completions/mean_terminated_length": 237.37208557128906, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.06921213120222092, "epoch": 0.34952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.709870723711673e-06, "loss": 0.0, "num_tokens": 199099072.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06636394560337067, "epoch": 0.3496, "grad_norm": 0.0, "learning_rate": 2.7094835754662326e-06, "loss": 0.0, "step": 4370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.3203125, "completions/mean_terminated_length": 229.46971130371094, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.06680630892515182, "epoch": 0.34968, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7090963600670756e-06, "loss": 0.0, "num_tokens": 199195625.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06468784064054489, "epoch": 0.34976, "grad_norm": 0.0, "learning_rate": 2.7087090775413034e-06, "loss": 0.0, "step": 4372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 219.859375, "completions/mean_terminated_length": 205.1648406982422, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.0732276700437069, "epoch": 0.34984, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.708321727916022e-06, "loss": 0.0, "num_tokens": 199289303.0, "reward": 1.5, "reward_std": 0.0, "rewards/reward_fn/mean": 1.5, "rewards/reward_fn/std": 1.5058939456939697, "step": 4373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07659724354743958, "epoch": 0.34992, "grad_norm": 0.0, "learning_rate": 2.707934311218341e-06, "loss": 0.0, "step": 4374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.0, "completions/mean_terminated_length": 226.46153259277344, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.06715116277337074, "epoch": 0.35, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.707546827475376e-06, "loss": 0.0, "num_tokens": 199385687.0, "reward": 0.37749966979026794, "reward_std": 0.0, "rewards/reward_fn/mean": 0.37749966979026794, "rewards/reward_fn/std": 0.9951284527778625, "step": 4375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06739633157849312, "epoch": 0.35008, "grad_norm": 0.0, "learning_rate": 2.7071592767142476e-06, "loss": 0.0, "step": 4376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 245.6328125, "completions/mean_terminated_length": 234.59677124023438, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.07193483039736748, "epoch": 0.35016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7067716589620778e-06, "loss": 0.0, "num_tokens": 199482664.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07091112434864044, "epoch": 0.35024, "grad_norm": 0.0, "learning_rate": 2.706383974245998e-06, "loss": 0.0, "step": 4378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.21875, "completions/mean_terminated_length": 209.09524536132812, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.06773584708571434, "epoch": 0.35032, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7059962225931412e-06, "loss": 0.0, "num_tokens": 199577028.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06745768338441849, "epoch": 0.3504, "grad_norm": 0.0, "learning_rate": 2.7056084040306465e-06, "loss": 0.0, "step": 4380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.6328125, "completions/mean_terminated_length": 240.68421936035156, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.06617870181798935, "epoch": 0.35048, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7052205185856564e-06, "loss": 0.0, "num_tokens": 199673877.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06789600849151611, "epoch": 0.35056, "grad_norm": 0.0, "learning_rate": 2.704832566285319e-06, "loss": 0.0, "step": 4382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 216.015625, "completions/mean_terminated_length": 205.8235321044922, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.06725630164146423, "epoch": 0.35064, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.704444547156787e-06, "loss": 0.0, "num_tokens": 199767063.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06388882920145988, "epoch": 0.35072, "grad_norm": 0.0, "learning_rate": 2.7040564612272174e-06, "loss": 0.0, "step": 4384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.375, "completions/mean_terminated_length": 236.5, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.07506218552589417, "epoch": 0.3508, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7036683085237724e-06, "loss": 0.0, "num_tokens": 199863495.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0751306489109993, "epoch": 0.35088, "grad_norm": 0.0, "learning_rate": 2.703280089073618e-06, "loss": 0.0, "step": 4386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 224.1484375, "completions/mean_terminated_length": 205.6666717529297, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.062004245817661285, "epoch": 0.35096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.702891802903926e-06, "loss": 0.0, "num_tokens": 199957722.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06219092383980751, "epoch": 0.35104, "grad_norm": 0.0, "learning_rate": 2.7025034500418726e-06, "loss": 0.0, "step": 4388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 205.0078125, "completions/mean_terminated_length": 176.40243530273438, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.07150856778025627, "epoch": 0.35112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7021150305146378e-06, "loss": 0.0, "num_tokens": 200049499.0, "reward": 1.875, "reward_std": 0.0, "rewards/reward_fn/mean": 1.875, "rewards/reward_fn/std": 1.4580755233764648, "step": 4389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07128870114684105, "epoch": 0.3512, "grad_norm": 0.0, "learning_rate": 2.701726544349407e-06, "loss": 0.0, "step": 4390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 224.8125, "completions/mean_terminated_length": 216.0800018310547, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.0639876052737236, "epoch": 0.35128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7013379915733707e-06, "loss": 0.0, "num_tokens": 200143811.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06195499375462532, "epoch": 0.35136, "grad_norm": 0.0, "learning_rate": 2.7009493722137227e-06, "loss": 0.0, "step": 4392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.296875, "completions/mean_terminated_length": 209.0, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.07285471260547638, "epoch": 0.35144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.7005606862976626e-06, "loss": 0.0, "num_tokens": 200238825.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0707908347249031, "epoch": 0.35152, "grad_norm": 0.0, "learning_rate": 2.7001719338523944e-06, "loss": 0.0, "step": 4394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 208.09375, "completions/mean_terminated_length": 183.0, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.07032085955142975, "epoch": 0.3516, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6997831149051266e-06, "loss": 0.0, "num_tokens": 200330997.0, "reward": 0.45364314317703247, "reward_std": 0.0, "rewards/reward_fn/mean": 0.45364314317703247, "rewards/reward_fn/std": 0.9880856871604919, "step": 4395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.062361547723412514, "epoch": 0.35168, "grad_norm": 0.0, "learning_rate": 2.699394229483072e-06, "loss": 0.0, "step": 4396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 233.3203125, "completions/mean_terminated_length": 219.253173828125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.08148352801799774, "epoch": 0.35176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.699005277613449e-06, "loss": 0.0, "num_tokens": 200426398.0, "reward": 0.11646917462348938, "reward_std": 0.0, "rewards/reward_fn/mean": 0.11646917462348938, "rewards/reward_fn/std": 0.23321440815925598, "step": 4397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0761711597442627, "epoch": 0.35184, "grad_norm": 0.0, "learning_rate": 2.69861625932348e-06, "loss": 0.0, "step": 4398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 223.3359375, "completions/mean_terminated_length": 203.7375030517578, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.06379713863134384, "epoch": 0.35192, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.698227174640392e-06, "loss": 0.0, "num_tokens": 200520521.0, "reward": 0.08572613447904587, "reward_std": 0.0, "rewards/reward_fn/mean": 0.08572613447904587, "rewards/reward_fn/std": 0.22770123183727264, "step": 4399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06724605709314346, "epoch": 0.352, "grad_norm": 0.0, "learning_rate": 2.6978380235914172e-06, "loss": 0.0, "step": 4400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.7578125, "completions/mean_terminated_length": 196.9058837890625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.07504111528396606, "epoch": 0.35208, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6974488062037915e-06, "loss": 0.0, "num_tokens": 200613802.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07734327390789986, "epoch": 0.35216, "grad_norm": 0.0, "learning_rate": 2.6970595225047566e-06, "loss": 0.0, "step": 4402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.34375, "completions/mean_terminated_length": 230.60675048828125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.0633419118821621, "epoch": 0.35224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6966701725215577e-06, "loss": 0.0, "num_tokens": 200709846.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06460186094045639, "epoch": 0.35232, "grad_norm": 0.0, "learning_rate": 2.6962807562814457e-06, "loss": 0.0, "step": 4404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 223.2734375, "completions/mean_terminated_length": 218.26126098632812, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.08233947306871414, "epoch": 0.3524, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.695891273811675e-06, "loss": 0.0, "num_tokens": 200803961.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07727901637554169, "epoch": 0.35248, "grad_norm": 0.0, "learning_rate": 2.695501725139506e-06, "loss": 0.0, "step": 4406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 249.046875, "completions/mean_terminated_length": 241.40982055664062, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.06917602196335793, "epoch": 0.35256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6951121102922026e-06, "loss": 0.0, "num_tokens": 200901375.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06913447752594948, "epoch": 0.35264, "grad_norm": 0.0, "learning_rate": 2.694722429297033e-06, "loss": 0.0, "step": 4408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 241.15625, "completions/mean_terminated_length": 229.61111450195312, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.07395555451512337, "epoch": 0.35272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6943326821812726e-06, "loss": 0.0, "num_tokens": 200997779.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07360044121742249, "epoch": 0.3528, "grad_norm": 0.0, "learning_rate": 2.6939428689721984e-06, "loss": 0.0, "step": 4410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.203125, "completions/mean_terminated_length": 231.8426971435547, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.07086547464132309, "epoch": 0.35288, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.693552989697093e-06, "loss": 0.0, "num_tokens": 201093933.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06736836954951286, "epoch": 0.35296, "grad_norm": 0.0, "learning_rate": 2.693163044383244e-06, "loss": 0.0, "step": 4412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.640625, "completions/mean_terminated_length": 237.41305541992188, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.07654951885342598, "epoch": 0.35304, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6927730330579436e-06, "loss": 0.0, "num_tokens": 201190527.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07359690219163895, "epoch": 0.35312, "grad_norm": 0.0, "learning_rate": 2.6923829557484882e-06, "loss": 0.0, "step": 4414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 229.75, "completions/mean_terminated_length": 219.8709716796875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.07121764495968819, "epoch": 0.3532, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6919928124821803e-06, "loss": 0.0, "num_tokens": 201285471.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06576160341501236, "epoch": 0.35328, "grad_norm": 0.0, "learning_rate": 2.691602603286324e-06, "loss": 0.0, "step": 4416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.5078125, "completions/mean_terminated_length": 216.20730590820312, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.06277382373809814, "epoch": 0.35336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6912123281882306e-06, "loss": 0.0, "num_tokens": 201380512.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06359495967626572, "epoch": 0.35344, "grad_norm": 0.0, "learning_rate": 2.6908219872152158e-06, "loss": 0.0, "step": 4418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 246.3671875, "completions/mean_terminated_length": 239.3378448486328, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.06310934387147427, "epoch": 0.35352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6904315803945982e-06, "loss": 0.0, "num_tokens": 201477583.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06267191469669342, "epoch": 0.3536, "grad_norm": 0.0, "learning_rate": 2.6900411077537033e-06, "loss": 0.0, "step": 4420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 230.671875, "completions/mean_terminated_length": 216.46340942382812, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.07801331207156181, "epoch": 0.35368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.68965056931986e-06, "loss": 0.0, "num_tokens": 201572645.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08200563862919807, "epoch": 0.35376, "grad_norm": 0.0, "learning_rate": 2.689259965120401e-06, "loss": 0.0, "step": 4422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1640625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 218.8359375, "completions/mean_terminated_length": 211.54205322265625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.07093122601509094, "epoch": 0.35384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6888692951826647e-06, "loss": 0.0, "num_tokens": 201666192.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07144191488623619, "epoch": 0.35392, "grad_norm": 0.0, "learning_rate": 2.6884785595339937e-06, "loss": 0.0, "step": 4424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 226.8203125, "completions/mean_terminated_length": 207.49349975585938, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.07685241848230362, "epoch": 0.354, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6880877582017366e-06, "loss": 0.0, "num_tokens": 201760761.0, "reward": 0.3923865556716919, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3923865556716919, "rewards/reward_fn/std": 0.9905130863189697, "step": 4425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07014959305524826, "epoch": 0.35408, "grad_norm": 0.0, "learning_rate": 2.687696891213244e-06, "loss": 0.0, "step": 4426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 215.21875, "completions/mean_terminated_length": 204.31683349609375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.06412648595869541, "epoch": 0.35416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.687305958595873e-06, "loss": 0.0, "num_tokens": 201853845.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07133805751800537, "epoch": 0.35424, "grad_norm": 0.0, "learning_rate": 2.6869149603769853e-06, "loss": 0.0, "step": 4428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 231.046875, "completions/mean_terminated_length": 221.65591430664062, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.06733471900224686, "epoch": 0.35432, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6865238965839457e-06, "loss": 0.0, "num_tokens": 201948955.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06889478117227554, "epoch": 0.3544, "grad_norm": 0.0, "learning_rate": 2.6861327672441246e-06, "loss": 0.0, "step": 4430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.125, "completions/mean_terminated_length": 237.05746459960938, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.06985775381326675, "epoch": 0.35448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6857415723848973e-06, "loss": 0.0, "num_tokens": 202045611.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06728745996952057, "epoch": 0.35456, "grad_norm": 0.0, "learning_rate": 2.6853503120336436e-06, "loss": 0.0, "step": 4432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.2578125, "completions/mean_terminated_length": 220.60000610351562, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.07291939482092857, "epoch": 0.35464, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.684958986217747e-06, "loss": 0.0, "num_tokens": 202141260.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06848574057221413, "epoch": 0.35472, "grad_norm": 0.0, "learning_rate": 2.684567594964596e-06, "loss": 0.0, "step": 4434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.7578125, "completions/mean_terminated_length": 239.10000610351562, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.06781962886452675, "epoch": 0.3548, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6841761383015845e-06, "loss": 0.0, "num_tokens": 202238381.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06854922696948051, "epoch": 0.35488, "grad_norm": 0.0, "learning_rate": 2.6837846162561107e-06, "loss": 0.0, "step": 4436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 237.3125, "completions/mean_terminated_length": 228.18605041503906, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.07528501003980637, "epoch": 0.35496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.683393028855576e-06, "loss": 0.0, "num_tokens": 202334293.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07313055172562599, "epoch": 0.35504, "grad_norm": 0.0, "learning_rate": 2.6830013761273873e-06, "loss": 0.0, "step": 4438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 219.7578125, "completions/mean_terminated_length": 202.67816162109375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.07190744578838348, "epoch": 0.35512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.682609658098957e-06, "loss": 0.0, "num_tokens": 202427958.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07672129198908806, "epoch": 0.3552, "grad_norm": 0.0, "learning_rate": 2.6822178747977006e-06, "loss": 0.0, "step": 4440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 236.0078125, "completions/mean_terminated_length": 229.06317138671875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.06690993905067444, "epoch": 0.35528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.68182602625104e-06, "loss": 0.0, "num_tokens": 202523703.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07078203558921814, "epoch": 0.35536, "grad_norm": 0.0, "learning_rate": 2.6814341124863984e-06, "loss": 0.0, "step": 4442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.2578125, "completions/mean_terminated_length": 220.56988525390625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.07124993950128555, "epoch": 0.35544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6810421335312075e-06, "loss": 0.0, "num_tokens": 202618712.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07140221446752548, "epoch": 0.35552, "grad_norm": 0.0, "learning_rate": 2.6806500894129003e-06, "loss": 0.0, "step": 4444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.9453125, "completions/mean_terminated_length": 226.49424743652344, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.07424819841980934, "epoch": 0.3556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6802579801589165e-06, "loss": 0.0, "num_tokens": 202714449.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07409375160932541, "epoch": 0.35568, "grad_norm": 0.0, "learning_rate": 2.6798658057966996e-06, "loss": 0.0, "step": 4446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 245.375, "completions/mean_terminated_length": 236.57142639160156, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.06905698031187057, "epoch": 0.35576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6794735663536976e-06, "loss": 0.0, "num_tokens": 202811393.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06739000231027603, "epoch": 0.35584, "grad_norm": 0.0, "learning_rate": 2.679081261857363e-06, "loss": 0.0, "step": 4448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.7421875, "completions/mean_terminated_length": 218.07691955566406, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.07382945716381073, "epoch": 0.35592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.678688892335153e-06, "loss": 0.0, "num_tokens": 202907232.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07322215288877487, "epoch": 0.356, "grad_norm": 0.0, "learning_rate": 2.6782964578145293e-06, "loss": 0.0, "step": 4450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 181.5625, "completions/mean_terminated_length": 162.58824157714844, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.07444365695118904, "epoch": 0.35608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6779039583229584e-06, "loss": 0.0, "num_tokens": 202996008.0, "reward": 0.8748345375061035, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8748345375061035, "rewards/reward_fn/std": 1.2735799551010132, "step": 4451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0717884786427021, "epoch": 0.35616, "grad_norm": 0.0, "learning_rate": 2.677511393887911e-06, "loss": 0.0, "step": 4452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 172.6484375, "completions/mean_terminated_length": 159.8828887939453, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.07356225326657295, "epoch": 0.35624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.677118764536862e-06, "loss": 0.0, "num_tokens": 203083643.0, "reward": 0.7864140868186951, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7864140868186951, "rewards/reward_fn/std": 1.2865034341812134, "step": 4453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07283508777618408, "epoch": 0.35632, "grad_norm": 0.0, "learning_rate": 2.6767260702972917e-06, "loss": 0.0, "step": 4454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.5859375, "completions/mean_terminated_length": 222.97589111328125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.0695224329829216, "epoch": 0.3564, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6763333111966844e-06, "loss": 0.0, "num_tokens": 203179206.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0674804225564003, "epoch": 0.35648, "grad_norm": 0.0, "learning_rate": 2.67594048726253e-06, "loss": 0.0, "step": 4456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.375, "completions/mean_terminated_length": 205.36471557617188, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.06328358687460423, "epoch": 0.35656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.675547598522321e-06, "loss": 0.0, "num_tokens": 203273206.0, "reward": 0.3948310613632202, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3948310613632202, "rewards/reward_fn/std": 0.9899041056632996, "step": 4457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.061225298792123795, "epoch": 0.35664, "grad_norm": 0.0, "learning_rate": 2.6751546450035558e-06, "loss": 0.0, "step": 4458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 223.3671875, "completions/mean_terminated_length": 210.09890747070312, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.06989486515522003, "epoch": 0.35672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.674761626733737e-06, "loss": 0.0, "num_tokens": 203367333.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07233558222651482, "epoch": 0.3568, "grad_norm": 0.0, "learning_rate": 2.6743685437403713e-06, "loss": 0.0, "step": 4460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.734375, "completions/mean_terminated_length": 226.13514709472656, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.06944680213928223, "epoch": 0.35688, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6739753960509705e-06, "loss": 0.0, "num_tokens": 203463427.0, "reward": 0.012458499521017075, "reward_std": 0.0, "rewards/reward_fn/mean": 0.012458499521017075, "rewards/reward_fn/std": 0.03309160843491554, "step": 4461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06900215893983841, "epoch": 0.35696, "grad_norm": 0.0, "learning_rate": 2.6735821836930512e-06, "loss": 0.0, "step": 4462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 213.3046875, "completions/mean_terminated_length": 196.5978240966797, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.06814052909612656, "epoch": 0.35704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.673188906694134e-06, "loss": 0.0, "num_tokens": 203556266.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06853550672531128, "epoch": 0.35712, "grad_norm": 0.0, "learning_rate": 2.6727955650817446e-06, "loss": 0.0, "step": 4464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 233.5, "completions/mean_terminated_length": 209.5483856201172, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.06261071003973484, "epoch": 0.3572, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6724021588834113e-06, "loss": 0.0, "num_tokens": 203651690.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.061257390305399895, "epoch": 0.35728, "grad_norm": 0.0, "learning_rate": 2.6720086881266697e-06, "loss": 0.0, "step": 4466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 218.2265625, "completions/mean_terminated_length": 194.79747009277344, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.07449610531330109, "epoch": 0.35736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.671615152839058e-06, "loss": 0.0, "num_tokens": 203745159.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07171539962291718, "epoch": 0.35744, "grad_norm": 0.0, "learning_rate": 2.67122155304812e-06, "loss": 0.0, "step": 4468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.625, "completions/mean_terminated_length": 240.5245819091797, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.07626023516058922, "epoch": 0.35752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.670827888781403e-06, "loss": 0.0, "num_tokens": 203842519.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07944514974951744, "epoch": 0.3576, "grad_norm": 0.0, "learning_rate": 2.67043416006646e-06, "loss": 0.0, "step": 4470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.1796875, "completions/mean_terminated_length": 221.83871459960938, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.07386019080877304, "epoch": 0.35768, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.670040366930847e-06, "loss": 0.0, "num_tokens": 203937646.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.074351005256176, "epoch": 0.35776, "grad_norm": 0.0, "learning_rate": 2.669646509402126e-06, "loss": 0.0, "step": 4472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.703125, "completions/mean_terminated_length": 219.7049102783203, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.06917237117886543, "epoch": 0.35784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6692525875078624e-06, "loss": 0.0, "num_tokens": 204033736.0, "reward": 0.05776464566588402, "reward_std": 0.0, "rewards/reward_fn/mean": 0.05776464566588402, "rewards/reward_fn/std": 0.15343140065670013, "step": 4473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06917553395032883, "epoch": 0.35792, "grad_norm": 0.0, "learning_rate": 2.6688586012756267e-06, "loss": 0.0, "step": 4474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 215.796875, "completions/mean_terminated_length": 194.0, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.06758605316281319, "epoch": 0.358, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.668464550732994e-06, "loss": 0.0, "num_tokens": 204126894.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07342928647994995, "epoch": 0.35808, "grad_norm": 0.0, "learning_rate": 2.668070435907544e-06, "loss": 0.0, "step": 4476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.75, "completions/mean_terminated_length": 233.76271057128906, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.06731738895177841, "epoch": 0.35816, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6676762568268593e-06, "loss": 0.0, "num_tokens": 204223886.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06812215223908424, "epoch": 0.35824, "grad_norm": 0.0, "learning_rate": 2.6672820135185296e-06, "loss": 0.0, "step": 4478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.6640625, "completions/mean_terminated_length": 219.84722900390625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.0721907690167427, "epoch": 0.35832, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6668877060101474e-06, "loss": 0.0, "num_tokens": 204319587.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07321861386299133, "epoch": 0.3584, "grad_norm": 0.0, "learning_rate": 2.6664933343293094e-06, "loss": 0.0, "step": 4480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 214.140625, "completions/mean_terminated_length": 206.84402465820312, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.07848668470978737, "epoch": 0.35848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.666098898503619e-06, "loss": 0.0, "num_tokens": 204412533.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08053873106837273, "epoch": 0.35856, "grad_norm": 0.0, "learning_rate": 2.665704398560681e-06, "loss": 0.0, "step": 4482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.4140625, "completions/mean_terminated_length": 221.758056640625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.06288637965917587, "epoch": 0.35864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.665309834528107e-06, "loss": 0.0, "num_tokens": 204508714.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0646429993212223, "epoch": 0.35872, "grad_norm": 0.0, "learning_rate": 2.6649152064335115e-06, "loss": 0.0, "step": 4484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.1953125, "completions/mean_terminated_length": 236.87342834472656, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.06549923121929169, "epoch": 0.3588, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6645205143045153e-06, "loss": 0.0, "num_tokens": 204605507.0, "reward": 0.08953723311424255, "reward_std": 0.0, "rewards/reward_fn/mean": 0.08953723311424255, "rewards/reward_fn/std": 0.23782405257225037, "step": 4485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06527974084019661, "epoch": 0.35888, "grad_norm": 0.0, "learning_rate": 2.6641257581687418e-06, "loss": 0.0, "step": 4486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.6015625, "completions/mean_terminated_length": 210.52127075195312, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.07392389327287674, "epoch": 0.35896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6637309380538207e-06, "loss": 0.0, "num_tokens": 204699536.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 4487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07187139615416527, "epoch": 0.35904, "grad_norm": 0.0, "learning_rate": 2.663336053987385e-06, "loss": 0.0, "step": 4488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.5, "completions/mean_terminated_length": 220.84506225585938, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.07455155998468399, "epoch": 0.35912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.662941105997071e-06, "loss": 0.0, "num_tokens": 204795344.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.074380774050951, "epoch": 0.3592, "grad_norm": 0.0, "learning_rate": 2.662546094110524e-06, "loss": 0.0, "step": 4490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 240.390625, "completions/mean_terminated_length": 234.7446746826172, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.06628698855638504, "epoch": 0.35928, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.662151018355387e-06, "loss": 0.0, "num_tokens": 204891650.0, "reward": 0.012458499521017075, "reward_std": 0.0, "rewards/reward_fn/mean": 0.012458499521017075, "rewards/reward_fn/std": 0.03309160843491554, "step": 4491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0664052665233612, "epoch": 0.35936, "grad_norm": 0.0, "learning_rate": 2.6617558787593137e-06, "loss": 0.0, "step": 4492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.8828125, "completions/mean_terminated_length": 223.4835205078125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.06801201403141022, "epoch": 0.35944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6613606753499582e-06, "loss": 0.0, "num_tokens": 204986995.0, "reward": 0.002499666763469577, "reward_std": 0.0, "rewards/reward_fn/mean": 0.002499666763469577, "rewards/reward_fn/std": 0.006639483384788036, "step": 4493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06695319712162018, "epoch": 0.35952, "grad_norm": 0.0, "learning_rate": 2.660965408154982e-06, "loss": 0.0, "step": 4494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 210.765625, "completions/mean_terminated_length": 204.30357360839844, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.0765443667769432, "epoch": 0.3596, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6605700772020485e-06, "loss": 0.0, "num_tokens": 205079509.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07707170024514198, "epoch": 0.35968, "grad_norm": 0.0, "learning_rate": 2.660174682518827e-06, "loss": 0.0, "step": 4496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 211.953125, "completions/mean_terminated_length": 196.02127075195312, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.07369958609342575, "epoch": 0.35976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.659779224132991e-06, "loss": 0.0, "num_tokens": 205172175.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07269398123025894, "epoch": 0.35984, "grad_norm": 0.0, "learning_rate": 2.659383702072218e-06, "loss": 0.0, "step": 4498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.0, "completions/mean_terminated_length": 239.21310424804688, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.07348312065005302, "epoch": 0.35992, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.658988116364191e-06, "loss": 0.0, "num_tokens": 205269455.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07102466747164726, "epoch": 0.36, "grad_norm": 0.0, "learning_rate": 2.658592467036597e-06, "loss": 0.0, "step": 4500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.3359375, "completions/mean_terminated_length": 235.48101806640625, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.07178085297346115, "epoch": 0.36008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6581967541171265e-06, "loss": 0.0, "num_tokens": 205366138.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07140370458364487, "epoch": 0.36016, "grad_norm": 0.0, "learning_rate": 2.6578009776334754e-06, "loss": 0.0, "step": 4502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.4296875, "completions/mean_terminated_length": 205.53334045410156, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.059416377916932106, "epoch": 0.36024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6574051376133447e-06, "loss": 0.0, "num_tokens": 205460657.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06171926483511925, "epoch": 0.36032, "grad_norm": 0.0, "learning_rate": 2.657009234084438e-06, "loss": 0.0, "step": 4504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5390625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 249.4921875, "completions/mean_terminated_length": 241.88136291503906, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.06556441262364388, "epoch": 0.3604, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6566132670744644e-06, "loss": 0.0, "num_tokens": 205558128.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06452962383627892, "epoch": 0.36048, "grad_norm": 0.0, "learning_rate": 2.6562172366111384e-06, "loss": 0.0, "step": 4506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 240.90625, "completions/mean_terminated_length": 226.27691650390625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.08381461352109909, "epoch": 0.36056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6558211427221768e-06, "loss": 0.0, "num_tokens": 205654500.0, "reward": 0.10954097658395767, "reward_std": 0.0, "rewards/reward_fn/mean": 0.10954097658395767, "rewards/reward_fn/std": 0.19051921367645264, "step": 4507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08206979557871819, "epoch": 0.36064, "grad_norm": 0.0, "learning_rate": 2.655424985435303e-06, "loss": 0.0, "step": 4508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 223.375, "completions/mean_terminated_length": 196.34286499023438, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.07136588171124458, "epoch": 0.36072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6550287647782426e-06, "loss": 0.0, "num_tokens": 205748628.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07214641198515892, "epoch": 0.3608, "grad_norm": 0.0, "learning_rate": 2.6546324807787283e-06, "loss": 0.0, "step": 4510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1640625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.7578125, "completions/mean_terminated_length": 232.98130798339844, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.06633442640304565, "epoch": 0.36088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6542361334644947e-06, "loss": 0.0, "num_tokens": 205844469.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06814690679311752, "epoch": 0.36096, "grad_norm": 0.0, "learning_rate": 2.6538397228632822e-06, "loss": 0.0, "step": 4512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.015625, "completions/mean_terminated_length": 220.6666717529297, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.06462019309401512, "epoch": 0.36104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6534432490028354e-06, "loss": 0.0, "num_tokens": 205939063.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06601566448807716, "epoch": 0.36112, "grad_norm": 0.0, "learning_rate": 2.6530467119109035e-06, "loss": 0.0, "step": 4514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.9296875, "completions/mean_terminated_length": 233.39561462402344, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.06717284768819809, "epoch": 0.3612, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.65265011161524e-06, "loss": 0.0, "num_tokens": 206035310.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06747737154364586, "epoch": 0.36128, "grad_norm": 0.0, "learning_rate": 2.6522534481436023e-06, "loss": 0.0, "step": 4516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5546875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.6171875, "completions/mean_terminated_length": 234.92982482910156, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.0725727528333664, "epoch": 0.36136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6518567215237522e-06, "loss": 0.0, "num_tokens": 206132413.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07520400732755661, "epoch": 0.36144, "grad_norm": 0.0, "learning_rate": 2.651459931783458e-06, "loss": 0.0, "step": 4518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.3984375, "completions/mean_terminated_length": 237.89022827148438, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.0706498958170414, "epoch": 0.36152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6510630789504896e-06, "loss": 0.0, "num_tokens": 206229232.0, "reward": 0.4936048090457916, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4936048090457916, "rewards/reward_fn/std": 1.0008580684661865, "step": 4519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0701904222369194, "epoch": 0.3616, "grad_norm": 0.0, "learning_rate": 2.650666163052622e-06, "loss": 0.0, "step": 4520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.5703125, "completions/mean_terminated_length": 218.32528686523438, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.07833962142467499, "epoch": 0.36168, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6502691841176362e-06, "loss": 0.0, "num_tokens": 206324409.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07701592147350311, "epoch": 0.36176, "grad_norm": 0.0, "learning_rate": 2.649872142173316e-06, "loss": 0.0, "step": 4522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 224.359375, "completions/mean_terminated_length": 213.8125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.06849270313978195, "epoch": 0.36184, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6494750372474507e-06, "loss": 0.0, "num_tokens": 206418663.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06625410169363022, "epoch": 0.36192, "grad_norm": 0.0, "learning_rate": 2.6490778693678332e-06, "loss": 0.0, "step": 4524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.5, "completions/mean_terminated_length": 209.3913116455078, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.06822102144360542, "epoch": 0.362, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.648680638562261e-06, "loss": 0.0, "num_tokens": 206512679.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06622133031487465, "epoch": 0.36208, "grad_norm": 0.0, "learning_rate": 2.6482833448585347e-06, "loss": 0.0, "step": 4526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 206.9140625, "completions/mean_terminated_length": 188.44085693359375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.07498282194137573, "epoch": 0.36216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.647885988284463e-06, "loss": 0.0, "num_tokens": 206604700.0, "reward": 0.8231669664382935, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8231669664382935, "rewards/reward_fn/std": 1.2697646617889404, "step": 4527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07428338378667831, "epoch": 0.36224, "grad_norm": 0.0, "learning_rate": 2.647488568867855e-06, "loss": 0.0, "step": 4528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.671875, "completions/mean_terminated_length": 218.3023223876953, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.07295270636677742, "epoch": 0.36232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.647091086636527e-06, "loss": 0.0, "num_tokens": 206699762.0, "reward": 0.49978116154670715, "reward_std": 0.0, "rewards/reward_fn/mean": 0.49978116154670715, "rewards/reward_fn/std": 1.0038195848464966, "step": 4529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0679762102663517, "epoch": 0.3624, "grad_norm": 0.0, "learning_rate": 2.646693541618298e-06, "loss": 0.0, "step": 4530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.65625, "completions/mean_terminated_length": 241.75758361816406, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.07235788181424141, "epoch": 0.36248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6462959338409914e-06, "loss": 0.0, "num_tokens": 206797126.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07295550033450127, "epoch": 0.36256, "grad_norm": 0.0, "learning_rate": 2.645898263332437e-06, "loss": 0.0, "step": 4532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.0703125, "completions/mean_terminated_length": 219.71580505371094, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.06541026756167412, "epoch": 0.36264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.645500530120466e-06, "loss": 0.0, "num_tokens": 206891983.0, "reward": 0.39967191219329834, "reward_std": 0.0, "rewards/reward_fn/mean": 0.39967191219329834, "rewards/reward_fn/std": 0.988822877407074, "step": 4533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06694671139121056, "epoch": 0.36272, "grad_norm": 0.0, "learning_rate": 2.6451027342329157e-06, "loss": 0.0, "step": 4534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.703125, "completions/mean_terminated_length": 210.1720428466797, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.06998515129089355, "epoch": 0.3628, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.644704875697629e-06, "loss": 0.0, "num_tokens": 206986025.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06907832995057106, "epoch": 0.36288, "grad_norm": 0.0, "learning_rate": 2.6443069545424503e-06, "loss": 0.0, "step": 4536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.3125, "completions/mean_terminated_length": 206.29507446289062, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.07159155607223511, "epoch": 0.36296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.643908970795231e-06, "loss": 0.0, "num_tokens": 207081297.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 4537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06986550614237785, "epoch": 0.36304, "grad_norm": 0.0, "learning_rate": 2.643510924483825e-06, "loss": 0.0, "step": 4538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 245.078125, "completions/mean_terminated_length": 237.8441619873047, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.07376578077673912, "epoch": 0.36312, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6431128156360906e-06, "loss": 0.0, "num_tokens": 207178203.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07361605018377304, "epoch": 0.3632, "grad_norm": 0.0, "learning_rate": 2.642714644279893e-06, "loss": 0.0, "step": 4540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.3203125, "completions/mean_terminated_length": 229.68605041503906, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.06122069992125034, "epoch": 0.36328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.642316410443099e-06, "loss": 0.0, "num_tokens": 207274244.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.059670547023415565, "epoch": 0.36336, "grad_norm": 0.0, "learning_rate": 2.641918114153582e-06, "loss": 0.0, "step": 4542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.1484375, "completions/mean_terminated_length": 232.8170623779297, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.06625914946198463, "epoch": 0.36344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.641519755439216e-06, "loss": 0.0, "num_tokens": 207370647.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06927366554737091, "epoch": 0.36352, "grad_norm": 0.0, "learning_rate": 2.641121334327884e-06, "loss": 0.0, "step": 4544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 212.8359375, "completions/mean_terminated_length": 198.4479217529297, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.07310013845562935, "epoch": 0.3636, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.64072285084747e-06, "loss": 0.0, "num_tokens": 207463426.0, "reward": 0.004997334908694029, "reward_std": 0.0, "rewards/reward_fn/mean": 0.004997334908694029, "rewards/reward_fn/std": 0.013273656368255615, "step": 4545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0708707645535469, "epoch": 0.36368, "grad_norm": 0.0, "learning_rate": 2.640324305025865e-06, "loss": 0.0, "step": 4546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5078125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 234.2734375, "completions/mean_terminated_length": 211.85716247558594, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.07449451833963394, "epoch": 0.36376, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6399256968909613e-06, "loss": 0.0, "num_tokens": 207558949.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07283501699566841, "epoch": 0.36384, "grad_norm": 0.0, "learning_rate": 2.6395270264706592e-06, "loss": 0.0, "step": 4548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 218.7265625, "completions/mean_terminated_length": 185.83824157714844, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.06933112442493439, "epoch": 0.36392, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6391282937928604e-06, "loss": 0.0, "num_tokens": 207652482.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06855570897459984, "epoch": 0.364, "grad_norm": 0.0, "learning_rate": 2.6387294988854716e-06, "loss": 0.0, "step": 4550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.640625, "completions/mean_terminated_length": 233.95790100097656, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.07399911805987358, "epoch": 0.36408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6383306417764047e-06, "loss": 0.0, "num_tokens": 207748692.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06752292066812515, "epoch": 0.36416, "grad_norm": 0.0, "learning_rate": 2.6379317224935753e-06, "loss": 0.0, "step": 4552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.1640625, "completions/mean_terminated_length": 232.1287078857422, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.07052330672740936, "epoch": 0.36424, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6375327410649033e-06, "loss": 0.0, "num_tokens": 207844585.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06981751322746277, "epoch": 0.36432, "grad_norm": 0.0, "learning_rate": 2.637133697518314e-06, "loss": 0.0, "step": 4554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 245.4296875, "completions/mean_terminated_length": 233.45001220703125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.0816911906003952, "epoch": 0.3644, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6367345918817366e-06, "loss": 0.0, "num_tokens": 207941536.0, "reward": 0.07711366564035416, "reward_std": 0.0, "rewards/reward_fn/mean": 0.07711366564035416, "rewards/reward_fn/std": 0.2048252522945404, "step": 4555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07899836078286171, "epoch": 0.36448, "grad_norm": 0.0, "learning_rate": 2.6363354241831025e-06, "loss": 0.0, "step": 4556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 215.8984375, "completions/mean_terminated_length": 197.0, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.07570269331336021, "epoch": 0.36456, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6359361944503503e-06, "loss": 0.0, "num_tokens": 208034707.0, "reward": 0.49999886751174927, "reward_std": 0.0, "rewards/reward_fn/mean": 0.49999886751174927, "rewards/reward_fn/std": 1.003928780555725, "step": 4557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07785187661647797, "epoch": 0.36464, "grad_norm": 0.0, "learning_rate": 2.635536902711422e-06, "loss": 0.0, "step": 4558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.6171875, "completions/mean_terminated_length": 237.13095092773438, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.06385984271764755, "epoch": 0.36472, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.635137548994263e-06, "loss": 0.0, "num_tokens": 208131426.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06528623029589653, "epoch": 0.3648, "grad_norm": 0.0, "learning_rate": 2.634738133326825e-06, "loss": 0.0, "step": 4560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 202.7734375, "completions/mean_terminated_length": 191.11428833007812, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.07277465611696243, "epoch": 0.36488, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.634338655737062e-06, "loss": 0.0, "num_tokens": 208222917.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0772884227335453, "epoch": 0.36496, "grad_norm": 0.0, "learning_rate": 2.633939116252933e-06, "loss": 0.0, "step": 4562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 224.8125, "completions/mean_terminated_length": 216.0800018310547, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.070482037961483, "epoch": 0.36504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6335395149024025e-06, "loss": 0.0, "num_tokens": 208317229.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07079609110951424, "epoch": 0.36512, "grad_norm": 0.0, "learning_rate": 2.6331398517134386e-06, "loss": 0.0, "step": 4564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.9921875, "completions/mean_terminated_length": 235.4107208251953, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.07208405435085297, "epoch": 0.3652, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6327401267140114e-06, "loss": 0.0, "num_tokens": 208414380.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07129979133605957, "epoch": 0.36528, "grad_norm": 0.0, "learning_rate": 2.6323403399320997e-06, "loss": 0.0, "step": 4566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.578125, "completions/mean_terminated_length": 235.2692413330078, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.07027801126241684, "epoch": 0.36536, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6319404913956836e-06, "loss": 0.0, "num_tokens": 208511606.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07447362318634987, "epoch": 0.36544, "grad_norm": 0.0, "learning_rate": 2.6315405811327473e-06, "loss": 0.0, "step": 4568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.328125, "completions/mean_terminated_length": 229.54928588867188, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.06619331613183022, "epoch": 0.36552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6311406091712814e-06, "loss": 0.0, "num_tokens": 208608032.0, "reward": 0.004997334908694029, "reward_std": 0.0, "rewards/reward_fn/mean": 0.004997334908694029, "rewards/reward_fn/std": 0.013273656368255615, "step": 4569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06386059150099754, "epoch": 0.3656, "grad_norm": 0.0, "learning_rate": 2.6307405755392804e-06, "loss": 0.0, "step": 4570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.203125, "completions/mean_terminated_length": 221.8709716796875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.07151605188846588, "epoch": 0.36568, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6303404802647405e-06, "loss": 0.0, "num_tokens": 208703162.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07045819237828255, "epoch": 0.36576, "grad_norm": 0.0, "learning_rate": 2.629940323375666e-06, "loss": 0.0, "step": 4572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.640625, "completions/mean_terminated_length": 218.0898895263672, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.07059894502162933, "epoch": 0.36584, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.629540104900062e-06, "loss": 0.0, "num_tokens": 208798092.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07116322219371796, "epoch": 0.36592, "grad_norm": 0.0, "learning_rate": 2.629139824865941e-06, "loss": 0.0, "step": 4574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.140625, "completions/mean_terminated_length": 216.22500610351562, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06635593622922897, "epoch": 0.366, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.628739483301317e-06, "loss": 0.0, "num_tokens": 208893214.0, "reward": 0.41815176606178284, "reward_std": 0.0, "rewards/reward_fn/mean": 0.41815176606178284, "rewards/reward_fn/std": 0.9862273931503296, "step": 4575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06943469867110252, "epoch": 0.36608, "grad_norm": 0.0, "learning_rate": 2.6283390802342103e-06, "loss": 0.0, "step": 4576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.109375, "completions/mean_terminated_length": 226.73562622070312, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.06955181062221527, "epoch": 0.36616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6279386156926462e-06, "loss": 0.0, "num_tokens": 208988972.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07124875858426094, "epoch": 0.36624, "grad_norm": 0.0, "learning_rate": 2.627538089704651e-06, "loss": 0.0, "step": 4578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.0390625, "completions/mean_terminated_length": 222.36842346191406, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.0714670792222023, "epoch": 0.36632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.627137502298258e-06, "loss": 0.0, "num_tokens": 209084081.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07090839371085167, "epoch": 0.3664, "grad_norm": 0.0, "learning_rate": 2.626736853501504e-06, "loss": 0.0, "step": 4580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 229.625, "completions/mean_terminated_length": 218.4888916015625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.07161556929349899, "epoch": 0.36648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.626336143342431e-06, "loss": 0.0, "num_tokens": 209179009.0, "reward": 0.002499666763469577, "reward_std": 0.0, "rewards/reward_fn/mean": 0.002499666763469577, "rewards/reward_fn/std": 0.006639483384788036, "step": 4581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07102853059768677, "epoch": 0.36656, "grad_norm": 0.0, "learning_rate": 2.6259353718490834e-06, "loss": 0.0, "step": 4582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.0390625, "completions/mean_terminated_length": 206.60227966308594, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.0762900784611702, "epoch": 0.36664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6255345390495118e-06, "loss": 0.0, "num_tokens": 209272966.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0720757395029068, "epoch": 0.36672, "grad_norm": 0.0, "learning_rate": 2.6251336449717697e-06, "loss": 0.0, "step": 4584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.171875, "completions/mean_terminated_length": 213.0, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.06748795881867409, "epoch": 0.3668, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6247326896439152e-06, "loss": 0.0, "num_tokens": 209368604.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07059549167752266, "epoch": 0.36688, "grad_norm": 0.0, "learning_rate": 2.6243316730940114e-06, "loss": 0.0, "step": 4586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.6953125, "completions/mean_terminated_length": 222.09259033203125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.0666879154741764, "epoch": 0.36696, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.623930595350125e-06, "loss": 0.0, "num_tokens": 209465077.0, "reward": 0.03868836537003517, "reward_std": 0.0, "rewards/reward_fn/mean": 0.03868836537003517, "rewards/reward_fn/std": 0.10276199877262115, "step": 4587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06607925146818161, "epoch": 0.36704, "grad_norm": 0.0, "learning_rate": 2.6235294564403276e-06, "loss": 0.0, "step": 4588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.1328125, "completions/mean_terminated_length": 236.6571502685547, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.07192707061767578, "epoch": 0.36712, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.623128256392695e-06, "loss": 0.0, "num_tokens": 209561350.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07184340804815292, "epoch": 0.3672, "grad_norm": 0.0, "learning_rate": 2.6227269952353063e-06, "loss": 0.0, "step": 4590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 235.2734375, "completions/mean_terminated_length": 221.09210205078125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.06483569741249084, "epoch": 0.36728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6223256729962452e-06, "loss": 0.0, "num_tokens": 209657001.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0650615394115448, "epoch": 0.36736, "grad_norm": 0.0, "learning_rate": 2.6219242897036004e-06, "loss": 0.0, "step": 4592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.4296875, "completions/mean_terminated_length": 234.09890747070312, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.06037522293627262, "epoch": 0.36744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6215228453854648e-06, "loss": 0.0, "num_tokens": 209753312.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0615620631724596, "epoch": 0.36752, "grad_norm": 0.0, "learning_rate": 2.6211213400699354e-06, "loss": 0.0, "step": 4594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 214.2734375, "completions/mean_terminated_length": 195.30682373046875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.07580798491835594, "epoch": 0.3676, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6207197737851125e-06, "loss": 0.0, "num_tokens": 209846275.0, "reward": 0.3799973428249359, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3799973428249359, "rewards/reward_fn/std": 0.9942457675933838, "step": 4595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0739385262131691, "epoch": 0.36768, "grad_norm": 0.0, "learning_rate": 2.6203181465591024e-06, "loss": 0.0, "step": 4596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.8984375, "completions/mean_terminated_length": 232.13580322265625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.06458460353314877, "epoch": 0.36776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6199164584200143e-06, "loss": 0.0, "num_tokens": 209942646.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0693635642528534, "epoch": 0.36784, "grad_norm": 0.0, "learning_rate": 2.6195147093959616e-06, "loss": 0.0, "step": 4598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 214.609375, "completions/mean_terminated_length": 208.27027893066406, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.07259193062782288, "epoch": 0.36792, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6191128995150633e-06, "loss": 0.0, "num_tokens": 210035652.0, "reward": 1.2021136283874512, "reward_std": 0.0, "rewards/reward_fn/mean": 1.2021136283874512, "rewards/reward_fn/std": 1.4117597341537476, "step": 4599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07263487577438354, "epoch": 0.368, "grad_norm": 0.0, "learning_rate": 2.6187110288054417e-06, "loss": 0.0, "step": 4600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.984375, "completions/mean_terminated_length": 239.97727966308594, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.061432139948010445, "epoch": 0.36808, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6183090972952234e-06, "loss": 0.0, "num_tokens": 210132546.0, "reward": 0.47506237030029297, "reward_std": 0.0, "rewards/reward_fn/mean": 0.47506237030029297, "rewards/reward_fn/std": 0.9935429096221924, "step": 4601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05710231326520443, "epoch": 0.36816, "grad_norm": 0.0, "learning_rate": 2.6179071050125397e-06, "loss": 0.0, "step": 4602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 234.3828125, "completions/mean_terminated_length": 224.9101104736328, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.06597131490707397, "epoch": 0.36824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6175050519855247e-06, "loss": 0.0, "num_tokens": 210228083.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06895844265818596, "epoch": 0.36832, "grad_norm": 0.0, "learning_rate": 2.6171029382423198e-06, "loss": 0.0, "step": 4604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 214.4921875, "completions/mean_terminated_length": 204.41748046875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.06977640837430954, "epoch": 0.3684, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.616700763811067e-06, "loss": 0.0, "num_tokens": 210321074.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07332972064614296, "epoch": 0.36848, "grad_norm": 0.0, "learning_rate": 2.616298528719914e-06, "loss": 0.0, "step": 4606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.8671875, "completions/mean_terminated_length": 219.2261962890625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.07474692165851593, "epoch": 0.36856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6158962329970146e-06, "loss": 0.0, "num_tokens": 210416289.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07358717918395996, "epoch": 0.36864, "grad_norm": 0.0, "learning_rate": 2.615493876670525e-06, "loss": 0.0, "step": 4608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.328125, "completions/mean_terminated_length": 208.11111450195312, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.0727524422109127, "epoch": 0.36872, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6150914597686043e-06, "loss": 0.0, "num_tokens": 210510283.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07368185371160507, "epoch": 0.3688, "grad_norm": 0.0, "learning_rate": 2.614688982319419e-06, "loss": 0.0, "step": 4610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.578125, "completions/mean_terminated_length": 219.2391357421875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.07949410751461983, "epoch": 0.36888, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.614286444351138e-06, "loss": 0.0, "num_tokens": 210605205.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07626955211162567, "epoch": 0.36896, "grad_norm": 0.0, "learning_rate": 2.6138838458919344e-06, "loss": 0.0, "step": 4612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 224.3125, "completions/mean_terminated_length": 214.18556213378906, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.07156764343380928, "epoch": 0.36904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.613481186969985e-06, "loss": 0.0, "num_tokens": 210699453.0, "reward": -0.375, "reward_std": 0.0, "rewards/reward_fn/mean": -0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07370319589972496, "epoch": 0.36912, "grad_norm": 0.0, "learning_rate": 2.6130784676134737e-06, "loss": 0.0, "step": 4614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.453125, "completions/mean_terminated_length": 237.50685119628906, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.07346877083182335, "epoch": 0.3692, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6126756878505854e-06, "loss": 0.0, "num_tokens": 210796407.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07856309413909912, "epoch": 0.36928, "grad_norm": 0.0, "learning_rate": 2.61227284770951e-06, "loss": 0.0, "step": 4616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 210.8359375, "completions/mean_terminated_length": 198.7623748779297, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.060105327516794205, "epoch": 0.36936, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6118699472184433e-06, "loss": 0.0, "num_tokens": 210888930.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.058130571618676186, "epoch": 0.36944, "grad_norm": 0.0, "learning_rate": 2.611466986405583e-06, "loss": 0.0, "step": 4618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.734375, "completions/mean_terminated_length": 217.11428833007812, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.06135466508567333, "epoch": 0.36952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.611063965299132e-06, "loss": 0.0, "num_tokens": 210984512.0, "reward": 0.4505459666252136, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4505459666252136, "rewards/reward_fn/std": 0.987565279006958, "step": 4619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.063521858304739, "epoch": 0.3696, "grad_norm": 0.0, "learning_rate": 2.6106608839272986e-06, "loss": 0.0, "step": 4620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.9609375, "completions/mean_terminated_length": 224.41539001464844, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.06456084549427032, "epoch": 0.36968, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.610257742318294e-06, "loss": 0.0, "num_tokens": 211080763.0, "reward": 0.3972600996494293, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3972600996494293, "rewards/reward_fn/std": 0.9893408417701721, "step": 4621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06327711418271065, "epoch": 0.36976, "grad_norm": 0.0, "learning_rate": 2.6098545405003322e-06, "loss": 0.0, "step": 4622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.5859375, "completions/mean_terminated_length": 241.39999389648438, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.06514900922775269, "epoch": 0.36984, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.609451278501635e-06, "loss": 0.0, "num_tokens": 211178118.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06840862706303596, "epoch": 0.36992, "grad_norm": 0.0, "learning_rate": 2.609047956350426e-06, "loss": 0.0, "step": 4624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 202.609375, "completions/mean_terminated_length": 186.96969604492188, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.07557953521609306, "epoch": 0.37, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.608644574074933e-06, "loss": 0.0, "num_tokens": 211269588.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06901627779006958, "epoch": 0.37008, "grad_norm": 0.0, "learning_rate": 2.6082411317033894e-06, "loss": 0.0, "step": 4626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 223.6015625, "completions/mean_terminated_length": 214.52999877929688, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.06810101866722107, "epoch": 0.37016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.607837629264031e-06, "loss": 0.0, "num_tokens": 211363745.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0697370208799839, "epoch": 0.37024, "grad_norm": 0.0, "learning_rate": 2.6074340667850987e-06, "loss": 0.0, "step": 4628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.5625, "completions/mean_terminated_length": 216.6511688232422, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06758035346865654, "epoch": 0.37032, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6070304442948383e-06, "loss": 0.0, "num_tokens": 211458665.0, "reward": 0.03641407564282417, "reward_std": 0.0, "rewards/reward_fn/mean": 0.03641407564282417, "rewards/reward_fn/std": 0.09672114253044128, "step": 4629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06494555249810219, "epoch": 0.3704, "grad_norm": 0.0, "learning_rate": 2.6066267618214984e-06, "loss": 0.0, "step": 4630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 217.171875, "completions/mean_terminated_length": 198.87356567382812, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.07076921314001083, "epoch": 0.37048, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.606223019393333e-06, "loss": 0.0, "num_tokens": 211551999.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0718168132007122, "epoch": 0.37056, "grad_norm": 0.0, "learning_rate": 2.6058192170386004e-06, "loss": 0.0, "step": 4632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.921875, "completions/mean_terminated_length": 234.5625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.06343194469809532, "epoch": 0.37064, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.605415354785561e-06, "loss": 0.0, "num_tokens": 211648245.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06637309864163399, "epoch": 0.37072, "grad_norm": 0.0, "learning_rate": 2.6050114326624814e-06, "loss": 0.0, "step": 4634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.6953125, "completions/mean_terminated_length": 209.6630401611328, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.07055513188242912, "epoch": 0.3708, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.604607450697633e-06, "loss": 0.0, "num_tokens": 211742286.0, "reward": 0.10834798961877823, "reward_std": 0.0, "rewards/reward_fn/mean": 0.10834798961877823, "rewards/reward_fn/std": 0.2877882122993469, "step": 4635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06917648389935493, "epoch": 0.37088, "grad_norm": 0.0, "learning_rate": 2.6042034089192883e-06, "loss": 0.0, "step": 4636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 243.6328125, "completions/mean_terminated_length": 239.68040466308594, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.06637460365891457, "epoch": 0.37096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.603799307355728e-06, "loss": 0.0, "num_tokens": 211839007.0, "reward": 0.04533843323588371, "reward_std": 0.0, "rewards/reward_fn/mean": 0.04533843323588371, "rewards/reward_fn/std": 0.12042555958032608, "step": 4637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06685024499893188, "epoch": 0.37104, "grad_norm": 0.0, "learning_rate": 2.603395146035234e-06, "loss": 0.0, "step": 4638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.84375, "completions/mean_terminated_length": 228.5500030517578, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.06439918465912342, "epoch": 0.37112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6029909249860933e-06, "loss": 0.0, "num_tokens": 211935115.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06749451532959938, "epoch": 0.3712, "grad_norm": 0.0, "learning_rate": 2.6025866442365968e-06, "loss": 0.0, "step": 4640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6015625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 250.140625, "completions/mean_terminated_length": 241.29412841796875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.06723766028881073, "epoch": 0.37128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6021823038150398e-06, "loss": 0.0, "num_tokens": 212032669.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0653783529996872, "epoch": 0.37136, "grad_norm": 0.0, "learning_rate": 2.601777903749723e-06, "loss": 0.0, "step": 4642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.65625, "completions/mean_terminated_length": 212.8275909423828, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.06620556116104126, "epoch": 0.37144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.6013734440689493e-06, "loss": 0.0, "num_tokens": 212127217.0, "reward": 0.7974936366081238, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7974936366081238, "rewards/reward_fn/std": 1.2825363874435425, "step": 4643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0659705251455307, "epoch": 0.37152, "grad_norm": 0.0, "learning_rate": 2.600968924801027e-06, "loss": 0.0, "step": 4644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 245.4765625, "completions/mean_terminated_length": 229.05999755859375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.06882020831108093, "epoch": 0.3716, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.600564345974267e-06, "loss": 0.0, "num_tokens": 212224174.0, "reward": 0.06541186571121216, "reward_std": 0.0, "rewards/reward_fn/mean": 0.06541186571121216, "rewards/reward_fn/std": 0.121230348944664, "step": 4645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06593615189194679, "epoch": 0.37168, "grad_norm": 0.0, "learning_rate": 2.600159707616987e-06, "loss": 0.0, "step": 4646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 207.9296875, "completions/mean_terminated_length": 191.90625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.07233559340238571, "epoch": 0.37176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.599755009757507e-06, "loss": 0.0, "num_tokens": 212316325.0, "reward": 0.874853253364563, "reward_std": 0.0, "rewards/reward_fn/mean": 0.874853253364563, "rewards/reward_fn/std": 1.2735817432403564, "step": 4647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06995037943124771, "epoch": 0.37184, "grad_norm": 0.0, "learning_rate": 2.599350252424151e-06, "loss": 0.0, "step": 4648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.84375, "completions/mean_terminated_length": 234.6865692138672, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.06980862468481064, "epoch": 0.37192, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5989454356452485e-06, "loss": 0.0, "num_tokens": 212413201.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07178154960274696, "epoch": 0.372, "grad_norm": 0.0, "learning_rate": 2.598540559449132e-06, "loss": 0.0, "step": 4650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.09375, "completions/mean_terminated_length": 220.67469787597656, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.0751766674220562, "epoch": 0.37208, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.598135623864139e-06, "loss": 0.0, "num_tokens": 212508573.0, "reward": 0.38992840051651, "reward_std": 0.0, "rewards/reward_fn/mean": 0.38992840051651, "rewards/reward_fn/std": 0.9911679029464722, "step": 4651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08088899031281471, "epoch": 0.37216, "grad_norm": 0.0, "learning_rate": 2.5977306289186096e-06, "loss": 0.0, "step": 4652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 187.578125, "completions/mean_terminated_length": 182.4033660888672, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.06549526751041412, "epoch": 0.37224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5973255746408904e-06, "loss": 0.0, "num_tokens": 212598119.0, "reward": 0.7996163368225098, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7996163368225098, "rewards/reward_fn/std": 1.281852126121521, "step": 4653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06719431281089783, "epoch": 0.37232, "grad_norm": 0.0, "learning_rate": 2.5969204610593296e-06, "loss": 0.0, "step": 4654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.59375, "completions/mean_terminated_length": 229.2903289794922, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.07647084444761276, "epoch": 0.3724, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.596515288202282e-06, "loss": 0.0, "num_tokens": 212693939.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0726417824625969, "epoch": 0.37248, "grad_norm": 0.0, "learning_rate": 2.5961100560981064e-06, "loss": 0.0, "step": 4656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 220.71875, "completions/mean_terminated_length": 207.9574432373047, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.06901124864816666, "epoch": 0.37256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.595704764775162e-06, "loss": 0.0, "num_tokens": 212787727.0, "reward": 0.0688910037279129, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0688910037279129, "rewards/reward_fn/std": 0.18298465013504028, "step": 4657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07165145874023438, "epoch": 0.37264, "grad_norm": 0.0, "learning_rate": 2.5952994142618176e-06, "loss": 0.0, "step": 4658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 231.0390625, "completions/mean_terminated_length": 216.0625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.06320244446396828, "epoch": 0.37272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5948940045864413e-06, "loss": 0.0, "num_tokens": 212882836.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.062102071940898895, "epoch": 0.3728, "grad_norm": 0.0, "learning_rate": 2.5944885357774084e-06, "loss": 0.0, "step": 4660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 218.0546875, "completions/mean_terminated_length": 212.63394165039062, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.0635487399995327, "epoch": 0.37288, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.594083007863098e-06, "loss": 0.0, "num_tokens": 212976283.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06332285329699516, "epoch": 0.37296, "grad_norm": 0.0, "learning_rate": 2.5936774208718926e-06, "loss": 0.0, "step": 4662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.078125, "completions/mean_terminated_length": 221.4375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.07030325755476952, "epoch": 0.37304, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.593271774832178e-06, "loss": 0.0, "num_tokens": 213071269.0, "reward": 0.3923865556716919, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3923865556716919, "rewards/reward_fn/std": 0.9905130863189697, "step": 4663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0671626552939415, "epoch": 0.37312, "grad_norm": 0.0, "learning_rate": 2.592866069772346e-06, "loss": 0.0, "step": 4664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 251.640625, "completions/mean_terminated_length": 239.09091186523438, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.07455514371395111, "epoch": 0.3732, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5924603057207912e-06, "loss": 0.0, "num_tokens": 213169015.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07347157970070839, "epoch": 0.37328, "grad_norm": 0.0, "learning_rate": 2.592054482705913e-06, "loss": 0.0, "step": 4666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 219.9375, "completions/mean_terminated_length": 201.04762268066406, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.06907706707715988, "epoch": 0.37336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.591648600756115e-06, "loss": 0.0, "num_tokens": 213262703.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06874950230121613, "epoch": 0.37344, "grad_norm": 0.0, "learning_rate": 2.5912426598998045e-06, "loss": 0.0, "step": 4668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.0, "completions/mean_terminated_length": 230.40000915527344, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.07843466103076935, "epoch": 0.37352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.590836660165393e-06, "loss": 0.0, "num_tokens": 213358319.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0752566009759903, "epoch": 0.3736, "grad_norm": 0.0, "learning_rate": 2.590430601581296e-06, "loss": 0.0, "step": 4670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 246.0, "completions/mean_terminated_length": 236.89552307128906, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.07204905524849892, "epoch": 0.37368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.590024484175934e-06, "loss": 0.0, "num_tokens": 213455343.0, "reward": 0.12701314687728882, "reward_std": 0.0, "rewards/reward_fn/mean": 0.12701314687728882, "rewards/reward_fn/std": 0.329843133687973, "step": 4671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07534414529800415, "epoch": 0.37376, "grad_norm": 0.0, "learning_rate": 2.5896183079777303e-06, "loss": 0.0, "step": 4672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 220.875, "completions/mean_terminated_length": 207.13043212890625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.07104762643575668, "epoch": 0.37384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5892120730151125e-06, "loss": 0.0, "num_tokens": 213549151.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 4673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0721011832356453, "epoch": 0.37392, "grad_norm": 0.0, "learning_rate": 2.588805779316514e-06, "loss": 0.0, "step": 4674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.2890625, "completions/mean_terminated_length": 195.225341796875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.07461641728878021, "epoch": 0.374, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.58839942691037e-06, "loss": 0.0, "num_tokens": 213643140.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07509075105190277, "epoch": 0.37408, "grad_norm": 0.0, "learning_rate": 2.5879930158251213e-06, "loss": 0.0, "step": 4676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 236.3046875, "completions/mean_terminated_length": 220.98611450195312, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.061037616804242134, "epoch": 0.37416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.587586546089213e-06, "loss": 0.0, "num_tokens": 213738923.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0632433295249939, "epoch": 0.37424, "grad_norm": 0.0, "learning_rate": 2.5871800177310926e-06, "loss": 0.0, "step": 4678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.90625, "completions/mean_terminated_length": 213.8360595703125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.06733629107475281, "epoch": 0.37432, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.586773430779213e-06, "loss": 0.0, "num_tokens": 213834655.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06798244640231133, "epoch": 0.3744, "grad_norm": 0.0, "learning_rate": 2.586366785262032e-06, "loss": 0.0, "step": 4680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 230.7734375, "completions/mean_terminated_length": 218.0117645263672, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.06621215865015984, "epoch": 0.37448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5859600812080087e-06, "loss": 0.0, "num_tokens": 213929730.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 1.3280736207962036, "step": 4681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07161097228527069, "epoch": 0.37456, "grad_norm": 0.0, "learning_rate": 2.5855533186456095e-06, "loss": 0.0, "step": 4682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.4765625, "completions/mean_terminated_length": 238.76344299316406, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.07658354565501213, "epoch": 0.37464, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5851464976033035e-06, "loss": 0.0, "num_tokens": 214026431.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07573682442307472, "epoch": 0.37472, "grad_norm": 0.0, "learning_rate": 2.584739618109563e-06, "loss": 0.0, "step": 4684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.59375, "completions/mean_terminated_length": 237.95455932617188, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.07061014324426651, "epoch": 0.3748, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5843326801928664e-06, "loss": 0.0, "num_tokens": 214123147.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0733492448925972, "epoch": 0.37488, "grad_norm": 0.0, "learning_rate": 2.5839256838816944e-06, "loss": 0.0, "step": 4686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.21875, "completions/mean_terminated_length": 223.41934204101562, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.07414902746677399, "epoch": 0.37496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5835186292045314e-06, "loss": 0.0, "num_tokens": 214219431.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0759132020175457, "epoch": 0.37504, "grad_norm": 0.0, "learning_rate": 2.583111516189869e-06, "loss": 0.0, "step": 4688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 240.234375, "completions/mean_terminated_length": 235.19586181640625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.07562614604830742, "epoch": 0.37512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5827043448661987e-06, "loss": 0.0, "num_tokens": 214315717.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0743970014154911, "epoch": 0.3752, "grad_norm": 0.0, "learning_rate": 2.5822971152620204e-06, "loss": 0.0, "step": 4690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 234.2109375, "completions/mean_terminated_length": 220.6962127685547, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.0724596306681633, "epoch": 0.37528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5818898274058345e-06, "loss": 0.0, "num_tokens": 214411232.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07285372540354729, "epoch": 0.37536, "grad_norm": 0.0, "learning_rate": 2.581482481326147e-06, "loss": 0.0, "step": 4692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 201.0703125, "completions/mean_terminated_length": 183.5154571533203, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.057672712951898575, "epoch": 0.37544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5810750770514677e-06, "loss": 0.0, "num_tokens": 214502505.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05735774524509907, "epoch": 0.37552, "grad_norm": 0.0, "learning_rate": 2.5806676146103107e-06, "loss": 0.0, "step": 4694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.4765625, "completions/mean_terminated_length": 240.84722900390625, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "entropy": 0.06979547441005707, "epoch": 0.3756, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5802600940311945e-06, "loss": 0.0, "num_tokens": 214599718.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07189447060227394, "epoch": 0.37568, "grad_norm": 0.0, "learning_rate": 2.579852515342641e-06, "loss": 0.0, "step": 4696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 235.7109375, "completions/mean_terminated_length": 216.04615783691406, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.066726204007864, "epoch": 0.37576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.579444878573176e-06, "loss": 0.0, "num_tokens": 214695425.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06591852009296417, "epoch": 0.37584, "grad_norm": 0.0, "learning_rate": 2.579037183751331e-06, "loss": 0.0, "step": 4698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.6484375, "completions/mean_terminated_length": 237.60760498046875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.06973468512296677, "epoch": 0.37592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5786294309056383e-06, "loss": 0.0, "num_tokens": 214792276.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0668908879160881, "epoch": 0.376, "grad_norm": 0.0, "learning_rate": 2.5782216200646377e-06, "loss": 0.0, "step": 4700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 208.7890625, "completions/mean_terminated_length": 197.8942413330078, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.07039452344179153, "epoch": 0.37608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5778137512568718e-06, "loss": 0.0, "num_tokens": 214884537.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07362568750977516, "epoch": 0.37616, "grad_norm": 0.0, "learning_rate": 2.577405824510886e-06, "loss": 0.0, "step": 4702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 223.0859375, "completions/mean_terminated_length": 199.06756591796875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.06614567339420319, "epoch": 0.37624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.576997839855232e-06, "loss": 0.0, "num_tokens": 214978628.0, "reward": 0.4489399194717407, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4489399194717407, "rewards/reward_fn/std": 0.9873224496841431, "step": 4703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06624956429004669, "epoch": 0.37632, "grad_norm": 0.0, "learning_rate": 2.576589797318464e-06, "loss": 0.0, "step": 4704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 235.015625, "completions/mean_terminated_length": 219.7027130126953, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.06295411288738251, "epoch": 0.3764, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5761816969291406e-06, "loss": 0.0, "num_tokens": 215074246.0, "reward": 0.3874585032463074, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3874585032463074, "rewards/reward_fn/std": 0.9918686747550964, "step": 4705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06321068108081818, "epoch": 0.37648, "grad_norm": 0.0, "learning_rate": 2.5757735387158245e-06, "loss": 0.0, "step": 4706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 249.359375, "completions/mean_terminated_length": 243.68116760253906, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "entropy": 0.059154948219656944, "epoch": 0.37656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5753653227070825e-06, "loss": 0.0, "num_tokens": 215171700.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.057168371975421906, "epoch": 0.37664, "grad_norm": 0.0, "learning_rate": 2.574957048931485e-06, "loss": 0.0, "step": 4708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.171875, "completions/mean_terminated_length": 232.97727966308594, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.061613840982317924, "epoch": 0.37672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5745487174176077e-06, "loss": 0.0, "num_tokens": 215267978.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06408611312508583, "epoch": 0.3768, "grad_norm": 0.0, "learning_rate": 2.5741403281940292e-06, "loss": 0.0, "step": 4710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 237.5234375, "completions/mean_terminated_length": 210.5192413330078, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.0661524124443531, "epoch": 0.37688, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5737318812893324e-06, "loss": 0.0, "num_tokens": 215363917.0, "reward": 0.4438909888267517, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4438909888267517, "rewards/reward_fn/std": 0.9866783618927002, "step": 4711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06589515879750252, "epoch": 0.37696, "grad_norm": 0.0, "learning_rate": 2.5733233767321034e-06, "loss": 0.0, "step": 4712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.4375, "completions/mean_terminated_length": 197.7777862548828, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.07046430930495262, "epoch": 0.37704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5729148145509345e-06, "loss": 0.0, "num_tokens": 215459077.0, "reward": 0.8171311616897583, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8171311616897583, "rewards/reward_fn/std": 1.2771422863006592, "step": 4713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07078713923692703, "epoch": 0.37712, "grad_norm": 0.0, "learning_rate": 2.5725061947744204e-06, "loss": 0.0, "step": 4714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 249.765625, "completions/mean_terminated_length": 245.89874267578125, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.06835715100169182, "epoch": 0.3772, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5720975174311594e-06, "loss": 0.0, "num_tokens": 215556583.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06737206876277924, "epoch": 0.37728, "grad_norm": 0.0, "learning_rate": 2.5716887825497553e-06, "loss": 0.0, "step": 4716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.9375, "completions/mean_terminated_length": 199.18182373046875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.07129224017262459, "epoch": 0.37736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5712799901588156e-06, "loss": 0.0, "num_tokens": 215649887.0, "reward": 0.22228211164474487, "reward_std": 0.0, "rewards/reward_fn/mean": 0.22228211164474487, "rewards/reward_fn/std": 0.3028738498687744, "step": 4717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07498493418097496, "epoch": 0.37744, "grad_norm": 0.0, "learning_rate": 2.5708711402869508e-06, "loss": 0.0, "step": 4718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.578125, "completions/mean_terminated_length": 212.5507354736328, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.06382778473198414, "epoch": 0.37752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5704622329627758e-06, "loss": 0.0, "num_tokens": 215745193.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06269434280693531, "epoch": 0.3776, "grad_norm": 0.0, "learning_rate": 2.5700532682149105e-06, "loss": 0.0, "step": 4720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.5, "completions/mean_terminated_length": 229.60000610351562, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.07095040008425713, "epoch": 0.37768, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5696442460719776e-06, "loss": 0.0, "num_tokens": 215841385.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07098846137523651, "epoch": 0.37776, "grad_norm": 0.0, "learning_rate": 2.5692351665626046e-06, "loss": 0.0, "step": 4722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 212.1796875, "completions/mean_terminated_length": 191.5287322998047, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.07986447587609291, "epoch": 0.37784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5688260297154232e-06, "loss": 0.0, "num_tokens": 215934080.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07278544828295708, "epoch": 0.37792, "grad_norm": 0.0, "learning_rate": 2.5684168355590676e-06, "loss": 0.0, "step": 4724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.984375, "completions/mean_terminated_length": 241.51612854003906, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.06870724260807037, "epoch": 0.378, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.568007584122178e-06, "loss": 0.0, "num_tokens": 216031486.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07081728428602219, "epoch": 0.37808, "grad_norm": 0.0, "learning_rate": 2.5675982754333967e-06, "loss": 0.0, "step": 4726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 222.953125, "completions/mean_terminated_length": 208.47190856933594, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.06735830381512642, "epoch": 0.37816, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.567188909521372e-06, "loss": 0.0, "num_tokens": 216125560.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06658301502466202, "epoch": 0.37824, "grad_norm": 0.0, "learning_rate": 2.5667794864147552e-06, "loss": 0.0, "step": 4728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.25, "completions/mean_terminated_length": 238.90411376953125, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.0626705139875412, "epoch": 0.37832, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5663700061422007e-06, "loss": 0.0, "num_tokens": 216222616.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06535421870648861, "epoch": 0.3784, "grad_norm": 0.0, "learning_rate": 2.5659604687323682e-06, "loss": 0.0, "step": 4730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.1015625, "completions/mean_terminated_length": 213.4571533203125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.07923220098018646, "epoch": 0.37848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.565550874213921e-06, "loss": 0.0, "num_tokens": 216316453.0, "reward": 1.2005460262298584, "reward_std": 0.0, "rewards/reward_fn/mean": 1.2005460262298584, "rewards/reward_fn/std": 1.4124207496643066, "step": 4731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07753400132060051, "epoch": 0.37856, "grad_norm": 0.0, "learning_rate": 2.565141222615527e-06, "loss": 0.0, "step": 4732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.9765625, "completions/mean_terminated_length": 223.22093200683594, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.07599645853042603, "epoch": 0.37864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5647315139658565e-06, "loss": 0.0, "num_tokens": 216411938.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07195372879505157, "epoch": 0.37872, "grad_norm": 0.0, "learning_rate": 2.564321748293585e-06, "loss": 0.0, "step": 4734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.859375, "completions/mean_terminated_length": 236.16949462890625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.06377284973859787, "epoch": 0.3788, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5639119256273923e-06, "loss": 0.0, "num_tokens": 216509072.0, "reward": 0.017386555671691895, "reward_std": 0.0, "rewards/reward_fn/mean": 0.017386555671691895, "rewards/reward_fn/std": 0.04618125036358833, "step": 4735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0658142939209938, "epoch": 0.37888, "grad_norm": 0.0, "learning_rate": 2.563502045995961e-06, "loss": 0.0, "step": 4736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.3671875, "completions/mean_terminated_length": 229.98611450195312, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.06872018426656723, "epoch": 0.37896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.563092109427979e-06, "loss": 0.0, "num_tokens": 216605503.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06835721805691719, "epoch": 0.37904, "grad_norm": 0.0, "learning_rate": 2.5626821159521375e-06, "loss": 0.0, "step": 4738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.28125, "completions/mean_terminated_length": 217.38888549804688, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.06923848390579224, "epoch": 0.37912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.562272065597131e-06, "loss": 0.0, "num_tokens": 216701027.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07471277937293053, "epoch": 0.3792, "grad_norm": 0.0, "learning_rate": 2.56186195839166e-06, "loss": 0.0, "step": 4740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 217.0078125, "completions/mean_terminated_length": 201.1538543701172, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.06719117611646652, "epoch": 0.37928, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5614517943644255e-06, "loss": 0.0, "num_tokens": 216794340.0, "reward": 0.4990789294242859, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4990789294242859, "rewards/reward_fn/std": 1.0034698247909546, "step": 4741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06278415769338608, "epoch": 0.37936, "grad_norm": 0.0, "learning_rate": 2.5610415735441366e-06, "loss": 0.0, "step": 4742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 234.203125, "completions/mean_terminated_length": 223.55813598632812, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.06102779507637024, "epoch": 0.37944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5606312959595035e-06, "loss": 0.0, "num_tokens": 216889854.0, "reward": 0.38992840051651, "reward_std": 0.0, "rewards/reward_fn/mean": 0.38992840051651, "rewards/reward_fn/std": 0.9911679029464722, "step": 4743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06553180515766144, "epoch": 0.37952, "grad_norm": 0.0, "learning_rate": 2.5602209616392416e-06, "loss": 0.0, "step": 4744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.25, "completions/mean_terminated_length": 237.3731231689453, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.06559459865093231, "epoch": 0.3796, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5598105706120703e-06, "loss": 0.0, "num_tokens": 216986910.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07058801874518394, "epoch": 0.37968, "grad_norm": 0.0, "learning_rate": 2.559400122906712e-06, "loss": 0.0, "step": 4746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.046875, "completions/mean_terminated_length": 215.65789794921875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.07053955644369125, "epoch": 0.37976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5589896185518943e-06, "loss": 0.0, "num_tokens": 217082148.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06871749460697174, "epoch": 0.37984, "grad_norm": 0.0, "learning_rate": 2.558579057576347e-06, "loss": 0.0, "step": 4748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 218.9140625, "completions/mean_terminated_length": 190.06944274902344, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.07522139698266983, "epoch": 0.37992, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.558168440008807e-06, "loss": 0.0, "num_tokens": 217175705.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 4749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07601603120565414, "epoch": 0.38, "grad_norm": 0.0, "learning_rate": 2.557757765878011e-06, "loss": 0.0, "step": 4750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.9140625, "completions/mean_terminated_length": 227.60293579101562, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.06404894217848778, "epoch": 0.38008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5573470352127036e-06, "loss": 0.0, "num_tokens": 217272078.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06517649814486504, "epoch": 0.38016, "grad_norm": 0.0, "learning_rate": 2.556936248041631e-06, "loss": 0.0, "step": 4752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 223.953125, "completions/mean_terminated_length": 208.85057067871094, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.0686408020555973, "epoch": 0.38024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.556525404393544e-06, "loss": 0.0, "num_tokens": 217366280.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06978928297758102, "epoch": 0.38032, "grad_norm": 0.0, "learning_rate": 2.556114504297196e-06, "loss": 0.0, "step": 4754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.609375, "completions/mean_terminated_length": 236.10101318359375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.06595707684755325, "epoch": 0.3804, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5557035477813483e-06, "loss": 0.0, "num_tokens": 217462614.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07090917602181435, "epoch": 0.38048, "grad_norm": 0.0, "learning_rate": 2.555292534874761e-06, "loss": 0.0, "step": 4756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 217.6328125, "completions/mean_terminated_length": 195.37037658691406, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.0778287723660469, "epoch": 0.38056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5548814656062027e-06, "loss": 0.0, "num_tokens": 217556007.0, "reward": 0.004997334908694029, "reward_std": 0.0, "rewards/reward_fn/mean": 0.004997334908694029, "rewards/reward_fn/std": 0.013273656368255615, "step": 4757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07866884395480156, "epoch": 0.38064, "grad_norm": 0.0, "learning_rate": 2.5544703400044425e-06, "loss": 0.0, "step": 4758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.6484375, "completions/mean_terminated_length": 232.86459350585938, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.06683005765080452, "epoch": 0.38072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5540591580982555e-06, "loss": 0.0, "num_tokens": 217652090.0, "reward": 0.08703220635652542, "reward_std": 0.0, "rewards/reward_fn/mean": 0.08703220635652542, "rewards/reward_fn/std": 0.23117035627365112, "step": 4759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06818310543894768, "epoch": 0.3808, "grad_norm": 0.0, "learning_rate": 2.5536479199164196e-06, "loss": 0.0, "step": 4760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 221.4140625, "completions/mean_terminated_length": 205.6931915283203, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.07252231240272522, "epoch": 0.38088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.553236625487718e-06, "loss": 0.0, "num_tokens": 217745967.0, "reward": 0.43031713366508484, "reward_std": 0.0, "rewards/reward_fn/mean": 0.43031713366508484, "rewards/reward_fn/std": 0.9821427464485168, "step": 4761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07111237943172455, "epoch": 0.38096, "grad_norm": 0.0, "learning_rate": 2.5528252748409355e-06, "loss": 0.0, "step": 4762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 236.1015625, "completions/mean_terminated_length": 226.3837127685547, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.06575261801481247, "epoch": 0.38104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.552413868004864e-06, "loss": 0.0, "num_tokens": 217841724.0, "reward": 0.002499666763469577, "reward_std": 0.0, "rewards/reward_fn/mean": 0.002499666763469577, "rewards/reward_fn/std": 0.006639483384788036, "step": 4763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06778870522975922, "epoch": 0.38112, "grad_norm": 0.0, "learning_rate": 2.5520024050082963e-06, "loss": 0.0, "step": 4764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 224.7578125, "completions/mean_terminated_length": 211.56666564941406, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.0691610798239708, "epoch": 0.3812, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5515908858800314e-06, "loss": 0.0, "num_tokens": 217936029.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0697425827383995, "epoch": 0.38128, "grad_norm": 0.0, "learning_rate": 2.551179310648871e-06, "loss": 0.0, "step": 4766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 225.84375, "completions/mean_terminated_length": 200.05796813964844, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.06586236134171486, "epoch": 0.38136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5507676793436205e-06, "loss": 0.0, "num_tokens": 218030473.0, "reward": 0.7817869186401367, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7817869186401367, "rewards/reward_fn/std": 1.2883555889129639, "step": 4767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06653960421681404, "epoch": 0.38144, "grad_norm": 0.0, "learning_rate": 2.55035599199309e-06, "loss": 0.0, "step": 4768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 206.8828125, "completions/mean_terminated_length": 177.41250610351562, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.06826356798410416, "epoch": 0.38152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5499442486260936e-06, "loss": 0.0, "num_tokens": 218122490.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 4769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07024513557553291, "epoch": 0.3816, "grad_norm": 0.0, "learning_rate": 2.5495324492714493e-06, "loss": 0.0, "step": 4770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.4765625, "completions/mean_terminated_length": 211.32894897460938, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.06955941393971443, "epoch": 0.38168, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5491205939579777e-06, "loss": 0.0, "num_tokens": 218217399.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06778738647699356, "epoch": 0.38176, "grad_norm": 0.0, "learning_rate": 2.548708682714505e-06, "loss": 0.0, "step": 4772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.3984375, "completions/mean_terminated_length": 228.26388549804688, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.06823668628931046, "epoch": 0.38184, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5482967155698603e-06, "loss": 0.0, "num_tokens": 218313706.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0631241574883461, "epoch": 0.38192, "grad_norm": 0.0, "learning_rate": 2.547884692552877e-06, "loss": 0.0, "step": 4774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 221.0859375, "completions/mean_terminated_length": 191.2318878173828, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.06580561026930809, "epoch": 0.382, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5474726136923925e-06, "loss": 0.0, "num_tokens": 218407541.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.068902887403965, "epoch": 0.38208, "grad_norm": 0.0, "learning_rate": 2.5470604790172486e-06, "loss": 0.0, "step": 4776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 236.59375, "completions/mean_terminated_length": 223.74026489257812, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.0741766206920147, "epoch": 0.38216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5466482885562895e-06, "loss": 0.0, "num_tokens": 218503361.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07343157008290291, "epoch": 0.38224, "grad_norm": 0.0, "learning_rate": 2.5462360423383638e-06, "loss": 0.0, "step": 4778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.390625, "completions/mean_terminated_length": 214.1666717529297, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.07015539333224297, "epoch": 0.38232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5458237403923253e-06, "loss": 0.0, "num_tokens": 218599155.0, "reward": 0.4438909888267517, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4438909888267517, "rewards/reward_fn/std": 0.9866783618927002, "step": 4779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06684791296720505, "epoch": 0.3824, "grad_norm": 0.0, "learning_rate": 2.5454113827470303e-06, "loss": 0.0, "step": 4780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 224.9296875, "completions/mean_terminated_length": 210.287353515625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.06709428504109383, "epoch": 0.38248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.54499896943134e-06, "loss": 0.0, "num_tokens": 218693482.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0673433244228363, "epoch": 0.38256, "grad_norm": 0.0, "learning_rate": 2.5445865004741185e-06, "loss": 0.0, "step": 4782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 218.59375, "completions/mean_terminated_length": 193.81817626953125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.07454071566462517, "epoch": 0.38264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.544173975904234e-06, "loss": 0.0, "num_tokens": 218786998.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07332398742437363, "epoch": 0.38272, "grad_norm": 0.0, "learning_rate": 2.5437613957505596e-06, "loss": 0.0, "step": 4784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 240.9296875, "completions/mean_terminated_length": 233.8275909423828, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.07280907407402992, "epoch": 0.3828, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.543348760041972e-06, "loss": 0.0, "num_tokens": 218883373.0, "reward": 0.0688910037279129, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0688910037279129, "rewards/reward_fn/std": 0.18298465013504028, "step": 4785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07733948528766632, "epoch": 0.38288, "grad_norm": 0.0, "learning_rate": 2.5429360688073497e-06, "loss": 0.0, "step": 4786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.109375, "completions/mean_terminated_length": 216.9591827392578, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.07277487218379974, "epoch": 0.38296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5425233220755773e-06, "loss": 0.0, "num_tokens": 218977851.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07196490839123726, "epoch": 0.38304, "grad_norm": 0.0, "learning_rate": 2.542110519875543e-06, "loss": 0.0, "step": 4788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.5390625, "completions/mean_terminated_length": 229.7058868408203, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.07054471597075462, "epoch": 0.38312, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.541697662236139e-06, "loss": 0.0, "num_tokens": 219073920.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06973648816347122, "epoch": 0.3832, "grad_norm": 0.0, "learning_rate": 2.5412847491862606e-06, "loss": 0.0, "step": 4790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 243.578125, "completions/mean_terminated_length": 233.6056365966797, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.06964682787656784, "epoch": 0.38328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.540871780754807e-06, "loss": 0.0, "num_tokens": 219170634.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06808650493621826, "epoch": 0.38336, "grad_norm": 0.0, "learning_rate": 2.5404587569706824e-06, "loss": 0.0, "step": 4792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.9296875, "completions/mean_terminated_length": 223.1511688232422, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.07098673656582832, "epoch": 0.38344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5400456778627927e-06, "loss": 0.0, "num_tokens": 219266113.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07383247464895248, "epoch": 0.38352, "grad_norm": 0.0, "learning_rate": 2.539632543460051e-06, "loss": 0.0, "step": 4794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 227.4609375, "completions/mean_terminated_length": 219.10101318359375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.07373282313346863, "epoch": 0.3836, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.539219353791371e-06, "loss": 0.0, "num_tokens": 219360764.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07560446485877037, "epoch": 0.38368, "grad_norm": 0.0, "learning_rate": 2.538806108885672e-06, "loss": 0.0, "step": 4796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.796875, "completions/mean_terminated_length": 221.759033203125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.06846636906266212, "epoch": 0.38376, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5383928087718764e-06, "loss": 0.0, "num_tokens": 219456226.0, "reward": 0.43078044056892395, "reward_std": 0.0, "rewards/reward_fn/mean": 0.43078044056892395, "rewards/reward_fn/std": 0.9858564734458923, "step": 4797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06877651810646057, "epoch": 0.38384, "grad_norm": 0.0, "learning_rate": 2.5379794534789116e-06, "loss": 0.0, "step": 4798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 227.7890625, "completions/mean_terminated_length": 205.14083862304688, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.07045856863260269, "epoch": 0.38392, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5375660430357073e-06, "loss": 0.0, "num_tokens": 219550919.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07181033864617348, "epoch": 0.384, "grad_norm": 0.0, "learning_rate": 2.537152577471199e-06, "loss": 0.0, "step": 4800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 202.3125, "completions/mean_terminated_length": 187.96038818359375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.061054227873682976, "epoch": 0.38408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5367390568143235e-06, "loss": 0.0, "num_tokens": 219642351.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06003405153751373, "epoch": 0.38416, "grad_norm": 0.0, "learning_rate": 2.5363254810940234e-06, "loss": 0.0, "step": 4802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.84375, "completions/mean_terminated_length": 236.4383544921875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.06520985439419746, "epoch": 0.38424, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5359118503392453e-06, "loss": 0.0, "num_tokens": 219739227.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06670785322785378, "epoch": 0.38432, "grad_norm": 0.0, "learning_rate": 2.535498164578938e-06, "loss": 0.0, "step": 4804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 212.5234375, "completions/mean_terminated_length": 201.44117736816406, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.06905689090490341, "epoch": 0.3844, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5350844238420562e-06, "loss": 0.0, "num_tokens": 219831966.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07123270630836487, "epoch": 0.38448, "grad_norm": 0.0, "learning_rate": 2.534670628157557e-06, "loss": 0.0, "step": 4806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.828125, "completions/mean_terminated_length": 218.7777862548828, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.06525131687521935, "epoch": 0.38456, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5342567775544007e-06, "loss": 0.0, "num_tokens": 219926920.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06605512648820877, "epoch": 0.38464, "grad_norm": 0.0, "learning_rate": 2.5338428720615533e-06, "loss": 0.0, "step": 4808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.3359375, "completions/mean_terminated_length": 228.93548583984375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.06843432411551476, "epoch": 0.38472, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.533428911707984e-06, "loss": 0.0, "num_tokens": 220022707.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06798898801207542, "epoch": 0.3848, "grad_norm": 0.0, "learning_rate": 2.5330148965226655e-06, "loss": 0.0, "step": 4810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6328125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 248.65625, "completions/mean_terminated_length": 236.0, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.07289990782737732, "epoch": 0.38488, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5326008265345747e-06, "loss": 0.0, "num_tokens": 220120071.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.073982372879982, "epoch": 0.38496, "grad_norm": 0.0, "learning_rate": 2.532186701772692e-06, "loss": 0.0, "step": 4812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 229.671875, "completions/mean_terminated_length": 215.3975830078125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.06526057422161102, "epoch": 0.38504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.531772522266001e-06, "loss": 0.0, "num_tokens": 220215005.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06666094809770584, "epoch": 0.38512, "grad_norm": 0.0, "learning_rate": 2.531358288043491e-06, "loss": 0.0, "step": 4814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 224.2109375, "completions/mean_terminated_length": 212.247314453125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.07448210567235947, "epoch": 0.3852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.530943999134153e-06, "loss": 0.0, "num_tokens": 220309240.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07004668191075325, "epoch": 0.38528, "grad_norm": 0.0, "learning_rate": 2.530529655566984e-06, "loss": 0.0, "step": 4816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 228.1796875, "completions/mean_terminated_length": 222.08572387695312, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.06549012288451195, "epoch": 0.38536, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5301152573709825e-06, "loss": 0.0, "num_tokens": 220403983.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 4817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06760114058852196, "epoch": 0.38544, "grad_norm": 0.0, "learning_rate": 2.5297008045751532e-06, "loss": 0.0, "step": 4818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.3203125, "completions/mean_terminated_length": 211.375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.06546483188867569, "epoch": 0.38552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5292862972085025e-06, "loss": 0.0, "num_tokens": 220498360.0, "reward": 0.41141408681869507, "reward_std": 0.0, "rewards/reward_fn/mean": 0.41141408681869507, "rewards/reward_fn/std": 0.9868917465209961, "step": 4819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06755347177386284, "epoch": 0.3856, "grad_norm": 0.0, "learning_rate": 2.5288717353000416e-06, "loss": 0.0, "step": 4820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.59375, "completions/mean_terminated_length": 198.76470947265625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.07543138787150383, "epoch": 0.38568, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5284571188787858e-06, "loss": 0.0, "num_tokens": 220592772.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0735439732670784, "epoch": 0.38576, "grad_norm": 0.0, "learning_rate": 2.528042447973754e-06, "loss": 0.0, "step": 4822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 219.6171875, "completions/mean_terminated_length": 193.90667724609375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.06790392100811005, "epoch": 0.38584, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5276277226139677e-06, "loss": 0.0, "num_tokens": 220686419.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0668746717274189, "epoch": 0.38592, "grad_norm": 0.0, "learning_rate": 2.5272129428284553e-06, "loss": 0.0, "step": 4824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.8984375, "completions/mean_terminated_length": 243.8000030517578, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.0690455362200737, "epoch": 0.386, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.526798108646245e-06, "loss": 0.0, "num_tokens": 220783686.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.070882398635149, "epoch": 0.38608, "grad_norm": 0.0, "learning_rate": 2.526383220096372e-06, "loss": 0.0, "step": 4826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.65625, "completions/mean_terminated_length": 235.9394073486328, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.07042524218559265, "epoch": 0.38616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.525968277207874e-06, "loss": 0.0, "num_tokens": 220880666.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06546580791473389, "epoch": 0.38624, "grad_norm": 0.0, "learning_rate": 2.5255532800097916e-06, "loss": 0.0, "step": 4828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 229.9609375, "completions/mean_terminated_length": 218.125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.06963027641177177, "epoch": 0.38632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5251382285311713e-06, "loss": 0.0, "num_tokens": 220975637.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06959310546517372, "epoch": 0.3864, "grad_norm": 0.0, "learning_rate": 2.5247231228010623e-06, "loss": 0.0, "step": 4830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.0546875, "completions/mean_terminated_length": 218.73959350585938, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.07374263182282448, "epoch": 0.38648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5243079628485165e-06, "loss": 0.0, "num_tokens": 221070364.0, "reward": 0.1532493382692337, "reward_std": 0.0, "rewards/reward_fn/mean": 0.1532493382692337, "rewards/reward_fn/std": 0.29444992542266846, "step": 4831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07421007379889488, "epoch": 0.38656, "grad_norm": 0.0, "learning_rate": 2.5238927487025927e-06, "loss": 0.0, "step": 4832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 224.4765625, "completions/mean_terminated_length": 217.20193481445312, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.06632540002465248, "epoch": 0.38664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5234774803923496e-06, "loss": 0.0, "num_tokens": 221164633.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06034359335899353, "epoch": 0.38672, "grad_norm": 0.0, "learning_rate": 2.5230621579468527e-06, "loss": 0.0, "step": 4834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 214.390625, "completions/mean_terminated_length": 207.58181762695312, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.0605532880872488, "epoch": 0.3868, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5226467813951696e-06, "loss": 0.0, "num_tokens": 221257611.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06226526387035847, "epoch": 0.38688, "grad_norm": 0.0, "learning_rate": 2.5222313507663723e-06, "loss": 0.0, "step": 4836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.1796875, "completions/mean_terminated_length": 236.21429443359375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.06955011188983917, "epoch": 0.38696, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.521815866089537e-06, "loss": 0.0, "num_tokens": 221354530.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0713442824780941, "epoch": 0.38704, "grad_norm": 0.0, "learning_rate": 2.521400327393743e-06, "loss": 0.0, "step": 4838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.8984375, "completions/mean_terminated_length": 215.74697875976562, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.07301691547036171, "epoch": 0.38712, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5209847347080738e-06, "loss": 0.0, "num_tokens": 221449493.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07417768985033035, "epoch": 0.3872, "grad_norm": 0.0, "learning_rate": 2.520569088061616e-06, "loss": 0.0, "step": 4840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 251.375, "completions/mean_terminated_length": 247.54286193847656, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 0.07833841443061829, "epoch": 0.38728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5201533874834606e-06, "loss": 0.0, "num_tokens": 221547205.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07744579389691353, "epoch": 0.38736, "grad_norm": 0.0, "learning_rate": 2.519737633002703e-06, "loss": 0.0, "step": 4842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 250.203125, "completions/mean_terminated_length": 246.10667419433594, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.06546653807163239, "epoch": 0.38744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5193218246484407e-06, "loss": 0.0, "num_tokens": 221644767.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06236748769879341, "epoch": 0.38752, "grad_norm": 0.0, "learning_rate": 2.518905962449776e-06, "loss": 0.0, "step": 4844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.5703125, "completions/mean_terminated_length": 236.7604217529297, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.06863383576273918, "epoch": 0.3876, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.518490046435816e-06, "loss": 0.0, "num_tokens": 221741224.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06640248745679855, "epoch": 0.38768, "grad_norm": 0.0, "learning_rate": 2.5180740766356686e-06, "loss": 0.0, "step": 4846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 224.28125, "completions/mean_terminated_length": 213.26316833496094, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.06474476680159569, "epoch": 0.38776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5176580530784486e-06, "loss": 0.0, "num_tokens": 221835468.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06331012025475502, "epoch": 0.38784, "grad_norm": 0.0, "learning_rate": 2.517241975793273e-06, "loss": 0.0, "step": 4848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 219.234375, "completions/mean_terminated_length": 203.71112060546875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.06592773646116257, "epoch": 0.38792, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.516825844809263e-06, "loss": 0.0, "num_tokens": 221929066.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06986846029758453, "epoch": 0.388, "grad_norm": 0.0, "learning_rate": 2.5164096601555424e-06, "loss": 0.0, "step": 4850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.3046875, "completions/mean_terminated_length": 225.05101013183594, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.07252867519855499, "epoch": 0.38808, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5159934218612405e-06, "loss": 0.0, "num_tokens": 222024337.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07262393087148666, "epoch": 0.38816, "grad_norm": 0.0, "learning_rate": 2.5155771299554896e-06, "loss": 0.0, "step": 4852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 238.9296875, "completions/mean_terminated_length": 221.859375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.06892210617661476, "epoch": 0.38824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.515160784467426e-06, "loss": 0.0, "num_tokens": 222120456.0, "reward": 0.03641407564282417, "reward_std": 0.0, "rewards/reward_fn/mean": 0.03641407564282417, "rewards/reward_fn/std": 0.09672114253044128, "step": 4853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07114503160119057, "epoch": 0.38832, "grad_norm": 0.0, "learning_rate": 2.514744385426189e-06, "loss": 0.0, "step": 4854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.40625, "completions/mean_terminated_length": 219.7872314453125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.06694652140140533, "epoch": 0.3884, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5143279328609216e-06, "loss": 0.0, "num_tokens": 222215356.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07019728794693947, "epoch": 0.38848, "grad_norm": 0.0, "learning_rate": 2.513911426800772e-06, "loss": 0.0, "step": 4856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.59375, "completions/mean_terminated_length": 222.1999969482422, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.0712483748793602, "epoch": 0.38856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5134948672748913e-06, "loss": 0.0, "num_tokens": 222310280.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06981028988957405, "epoch": 0.38864, "grad_norm": 0.0, "learning_rate": 2.513078254312434e-06, "loss": 0.0, "step": 4858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.75, "completions/mean_terminated_length": 236.40000915527344, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.07048462331295013, "epoch": 0.38872, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.512661587942558e-06, "loss": 0.0, "num_tokens": 222407016.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06891653686761856, "epoch": 0.3888, "grad_norm": 0.0, "learning_rate": 2.5122448681944267e-06, "loss": 0.0, "step": 4860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.6484375, "completions/mean_terminated_length": 232.44871520996094, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.07075272127985954, "epoch": 0.38888, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.511828095097205e-06, "loss": 0.0, "num_tokens": 222503483.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07501672953367233, "epoch": 0.38896, "grad_norm": 0.0, "learning_rate": 2.5114112686800633e-06, "loss": 0.0, "step": 4862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.8671875, "completions/mean_terminated_length": 240.84042358398438, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.0681193396449089, "epoch": 0.38904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.510994388972174e-06, "loss": 0.0, "num_tokens": 222600362.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06748165935277939, "epoch": 0.38912, "grad_norm": 0.0, "learning_rate": 2.510577456002716e-06, "loss": 0.0, "step": 4864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5390625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 247.390625, "completions/mean_terminated_length": 237.32203674316406, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.07334639132022858, "epoch": 0.3892, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.51016046980087e-06, "loss": 0.0, "num_tokens": 222697564.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07458550110459328, "epoch": 0.38928, "grad_norm": 0.0, "learning_rate": 2.5097434303958186e-06, "loss": 0.0, "step": 4866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.625, "completions/mean_terminated_length": 236.093017578125, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.07565201073884964, "epoch": 0.38936, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.509326337816752e-06, "loss": 0.0, "num_tokens": 222794156.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07646529376506805, "epoch": 0.38944, "grad_norm": 0.0, "learning_rate": 2.508909192092861e-06, "loss": 0.0, "step": 4868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 214.7890625, "completions/mean_terminated_length": 204.78640747070312, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.07187702134251595, "epoch": 0.38952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.508491993253343e-06, "loss": 0.0, "num_tokens": 222887185.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07337955012917519, "epoch": 0.3896, "grad_norm": 0.0, "learning_rate": 2.5080747413273964e-06, "loss": 0.0, "step": 4870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6015625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.7734375, "completions/mean_terminated_length": 235.3529510498047, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.06639420241117477, "epoch": 0.38968, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.507657436344225e-06, "loss": 0.0, "num_tokens": 222984436.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06916727125644684, "epoch": 0.38976, "grad_norm": 0.0, "learning_rate": 2.507240078333035e-06, "loss": 0.0, "step": 4872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.109375, "completions/mean_terminated_length": 234.86111450195312, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.06809874624013901, "epoch": 0.38984, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5068226673230377e-06, "loss": 0.0, "num_tokens": 223081218.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06889593973755836, "epoch": 0.38992, "grad_norm": 0.0, "learning_rate": 2.506405203343447e-06, "loss": 0.0, "step": 4874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.34375, "completions/mean_terminated_length": 233.483154296875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.06977036967873573, "epoch": 0.39, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5059876864234816e-06, "loss": 0.0, "num_tokens": 223177518.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07011934742331505, "epoch": 0.39008, "grad_norm": 0.0, "learning_rate": 2.5055701165923625e-06, "loss": 0.0, "step": 4876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.078125, "completions/mean_terminated_length": 239.71795654296875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.06594285368919373, "epoch": 0.39016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5051524938793156e-06, "loss": 0.0, "num_tokens": 223274552.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06645172834396362, "epoch": 0.39024, "grad_norm": 0.0, "learning_rate": 2.5047348183135706e-06, "loss": 0.0, "step": 4878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 210.390625, "completions/mean_terminated_length": 198.19801330566406, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.06740352138876915, "epoch": 0.39032, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5043170899243594e-06, "loss": 0.0, "num_tokens": 223367018.0, "reward": 0.4919261336326599, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4919261336326599, "rewards/reward_fn/std": 1.0000982284545898, "step": 4879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06543517112731934, "epoch": 0.3904, "grad_norm": 0.0, "learning_rate": 2.5038993087409195e-06, "loss": 0.0, "step": 4880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 219.75, "completions/mean_terminated_length": 203.86517333984375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.06507521867752075, "epoch": 0.39048, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5034814747924897e-06, "loss": 0.0, "num_tokens": 223460682.0, "reward": 0.37749966979026794, "reward_std": 0.0, "rewards/reward_fn/mean": 0.37749966979026794, "rewards/reward_fn/std": 0.9951284527778625, "step": 4881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06304841674864292, "epoch": 0.39056, "grad_norm": 0.0, "learning_rate": 2.5030635881083155e-06, "loss": 0.0, "step": 4882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.265625, "completions/mean_terminated_length": 239.50001525878906, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.0689949057996273, "epoch": 0.39064, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5026456487176436e-06, "loss": 0.0, "num_tokens": 223557996.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06984983757138252, "epoch": 0.39072, "grad_norm": 0.0, "learning_rate": 2.5022276566497263e-06, "loss": 0.0, "step": 4884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 232.5390625, "completions/mean_terminated_length": 228.44952392578125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.06654157117009163, "epoch": 0.3908, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.5018096119338176e-06, "loss": 0.0, "num_tokens": 223653297.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0677185133099556, "epoch": 0.39088, "grad_norm": 0.0, "learning_rate": 2.5013915145991774e-06, "loss": 0.0, "step": 4886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5078125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 235.75, "completions/mean_terminated_length": 214.85716247558594, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.07164029031991959, "epoch": 0.39096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.500973364675067e-06, "loss": 0.0, "num_tokens": 223749009.0, "reward": 0.09519927203655243, "reward_std": 0.0, "rewards/reward_fn/mean": 0.09519927203655243, "rewards/reward_fn/std": 0.2528632879257202, "step": 4887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07226352393627167, "epoch": 0.39104, "grad_norm": 0.0, "learning_rate": 2.5005551621907523e-06, "loss": 0.0, "step": 4888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 220.6640625, "completions/mean_terminated_length": 214.50457763671875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.06816546246409416, "epoch": 0.39112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.500136907175504e-06, "loss": 0.0, "num_tokens": 223842790.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07440197095274925, "epoch": 0.3912, "grad_norm": 0.0, "learning_rate": 2.4997185996585954e-06, "loss": 0.0, "step": 4890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1171875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 220.953125, "completions/mean_terminated_length": 216.30088806152344, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.06529083847999573, "epoch": 0.39128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.499300239669303e-06, "loss": 0.0, "num_tokens": 223936608.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06357160955667496, "epoch": 0.39136, "grad_norm": 0.0, "learning_rate": 2.498881827236908e-06, "loss": 0.0, "step": 4892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 210.1015625, "completions/mean_terminated_length": 197.83168029785156, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.06216084957122803, "epoch": 0.39144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.498463362390695e-06, "loss": 0.0, "num_tokens": 224029037.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_fn/mean": 1.0, "rewards/reward_fn/std": 1.3280736207962036, "step": 4893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06234530359506607, "epoch": 0.39152, "grad_norm": 0.0, "learning_rate": 2.4980448451599516e-06, "loss": 0.0, "step": 4894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.375, "completions/mean_terminated_length": 212.9708709716797, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.07452591136097908, "epoch": 0.3916, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4976262755739696e-06, "loss": 0.0, "num_tokens": 224122909.0, "reward": 0.45211365818977356, "reward_std": 0.0, "rewards/reward_fn/mean": 0.45211365818977356, "rewards/reward_fn/std": 0.9878202080726624, "step": 4895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07177290320396423, "epoch": 0.39168, "grad_norm": 0.0, "learning_rate": 2.4972076536620457e-06, "loss": 0.0, "step": 4896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 202.8203125, "completions/mean_terminated_length": 186.5408172607422, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.07274030521512032, "epoch": 0.39176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.496788979453477e-06, "loss": 0.0, "num_tokens": 224214406.0, "reward": 0.11205794662237167, "reward_std": 0.0, "rewards/reward_fn/mean": 0.11205794662237167, "rewards/reward_fn/std": 0.2827778458595276, "step": 4897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07249831780791283, "epoch": 0.39184, "grad_norm": 0.0, "learning_rate": 2.496370252977568e-06, "loss": 0.0, "step": 4898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.6796875, "completions/mean_terminated_length": 241.08750915527344, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.06767336279153824, "epoch": 0.39192, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.495951474263624e-06, "loss": 0.0, "num_tokens": 224311517.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0663917288184166, "epoch": 0.392, "grad_norm": 0.0, "learning_rate": 2.495532643340956e-06, "loss": 0.0, "step": 4900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 247.9140625, "completions/mean_terminated_length": 240.55223083496094, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.07415047287940979, "epoch": 0.39208, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4951137602388773e-06, "loss": 0.0, "num_tokens": 224408786.0, "reward": 0.12067671120166779, "reward_std": 0.0, "rewards/reward_fn/mean": 0.12067671120166779, "rewards/reward_fn/std": 0.3205351233482361, "step": 4901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07594047859311104, "epoch": 0.39216, "grad_norm": 0.0, "learning_rate": 2.494694824986705e-06, "loss": 0.0, "step": 4902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.2734375, "completions/mean_terminated_length": 228.76136779785156, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.07471190020442009, "epoch": 0.39224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4942758376137604e-06, "loss": 0.0, "num_tokens": 224504693.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07752436399459839, "epoch": 0.39232, "grad_norm": 0.0, "learning_rate": 2.4938567981493685e-06, "loss": 0.0, "step": 4904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.984375, "completions/mean_terminated_length": 198.65670776367188, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.07512203603982925, "epoch": 0.3924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4934377066228567e-06, "loss": 0.0, "num_tokens": 224599155.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07599545642733574, "epoch": 0.39248, "grad_norm": 0.0, "learning_rate": 2.493018563063558e-06, "loss": 0.0, "step": 4906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.1796875, "completions/mean_terminated_length": 199.75, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.0673372782766819, "epoch": 0.39256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.492599367500808e-06, "loss": 0.0, "num_tokens": 224695434.0, "reward": 0.4505459666252136, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4505459666252136, "rewards/reward_fn/std": 0.987565279006958, "step": 4907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06802962347865105, "epoch": 0.39264, "grad_norm": 0.0, "learning_rate": 2.492180119963945e-06, "loss": 0.0, "step": 4908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 239.71875, "completions/mean_terminated_length": 230.2716064453125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.0676100067794323, "epoch": 0.39272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.491760820482313e-06, "loss": 0.0, "num_tokens": 224791654.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06601820141077042, "epoch": 0.3928, "grad_norm": 0.0, "learning_rate": 2.491341469085258e-06, "loss": 0.0, "step": 4910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.8828125, "completions/mean_terminated_length": 225.2841033935547, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.074529729783535, "epoch": 0.39288, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4909220658021297e-06, "loss": 0.0, "num_tokens": 224887255.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07480470836162567, "epoch": 0.39296, "grad_norm": 0.0, "learning_rate": 2.490502610662283e-06, "loss": 0.0, "step": 4912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.34375, "completions/mean_terminated_length": 218.2692413330078, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.07262005284428596, "epoch": 0.39304, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4900831036950743e-06, "loss": 0.0, "num_tokens": 224981635.0, "reward": 0.38249102234840393, "reward_std": 0.0, "rewards/reward_fn/mean": 0.38249102234840393, "rewards/reward_fn/std": 0.9934079051017761, "step": 4913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07632379233837128, "epoch": 0.39312, "grad_norm": 0.0, "learning_rate": 2.4896635449298655e-06, "loss": 0.0, "step": 4914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.453125, "completions/mean_terminated_length": 235.96719360351562, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.06514991447329521, "epoch": 0.3932, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4892439343960207e-06, "loss": 0.0, "num_tokens": 225078717.0, "reward": 0.009978720918297768, "reward_std": 0.0, "rewards/reward_fn/mean": 0.009978720918297768, "rewards/reward_fn/std": 0.02650495432317257, "step": 4915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06599539518356323, "epoch": 0.39328, "grad_norm": 0.0, "learning_rate": 2.488824272122908e-06, "loss": 0.0, "step": 4916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.515625, "completions/mean_terminated_length": 216.8249969482422, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.07365355640649796, "epoch": 0.39336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4884045581399e-06, "loss": 0.0, "num_tokens": 225173887.0, "reward": 0.07393992692232132, "reward_std": 0.0, "rewards/reward_fn/mean": 0.07393992692232132, "rewards/reward_fn/std": 0.19639533758163452, "step": 4917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07278754934668541, "epoch": 0.39344, "grad_norm": 0.0, "learning_rate": 2.4879847924763722e-06, "loss": 0.0, "step": 4918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.8125, "completions/mean_terminated_length": 232.0, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.07138699293136597, "epoch": 0.39352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4875649751617035e-06, "loss": 0.0, "num_tokens": 225270247.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06978408619761467, "epoch": 0.3936, "grad_norm": 0.0, "learning_rate": 2.487145106225277e-06, "loss": 0.0, "step": 4920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.703125, "completions/mean_terminated_length": 236.10870361328125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.07865769416093826, "epoch": 0.39368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4867251856964775e-06, "loss": 0.0, "num_tokens": 225366721.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07495075836777687, "epoch": 0.39376, "grad_norm": 0.0, "learning_rate": 2.4863052136046972e-06, "loss": 0.0, "step": 4922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.8984375, "completions/mean_terminated_length": 221.82568359375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.06888153403997421, "epoch": 0.39384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.485885189979329e-06, "loss": 0.0, "num_tokens": 225461300.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06786476820707321, "epoch": 0.39392, "grad_norm": 0.0, "learning_rate": 2.4854651148497693e-06, "loss": 0.0, "step": 4924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.5859375, "completions/mean_terminated_length": 235.62820434570312, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.07094606384634972, "epoch": 0.394, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.48504498824542e-06, "loss": 0.0, "num_tokens": 225558015.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0687522292137146, "epoch": 0.39408, "grad_norm": 0.0, "learning_rate": 2.4846248101956853e-06, "loss": 0.0, "step": 4926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.890625, "completions/mean_terminated_length": 230.88311767578125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.07566732168197632, "epoch": 0.39416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4842045807299725e-06, "loss": 0.0, "num_tokens": 225654385.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07232434302568436, "epoch": 0.39424, "grad_norm": 0.0, "learning_rate": 2.4837842998776936e-06, "loss": 0.0, "step": 4928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 231.734375, "completions/mean_terminated_length": 215.13157653808594, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.0709606185555458, "epoch": 0.39432, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.483363967668264e-06, "loss": 0.0, "num_tokens": 225749583.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06813554838299751, "epoch": 0.3944, "grad_norm": 0.0, "learning_rate": 2.4829435841311026e-06, "loss": 0.0, "step": 4930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 229.8046875, "completions/mean_terminated_length": 213.0128173828125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.06538161635398865, "epoch": 0.39448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.482523149295632e-06, "loss": 0.0, "num_tokens": 225844534.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06661776453256607, "epoch": 0.39456, "grad_norm": 0.0, "learning_rate": 2.4821026631912773e-06, "loss": 0.0, "step": 4932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.9296875, "completions/mean_terminated_length": 195.01219177246094, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.06041870079934597, "epoch": 0.39464, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4816821258474693e-06, "loss": 0.0, "num_tokens": 225937837.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.061515141278505325, "epoch": 0.39472, "grad_norm": 0.0, "learning_rate": 2.48126153729364e-06, "loss": 0.0, "step": 4934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.4609375, "completions/mean_terminated_length": 223.60215759277344, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.0718977302312851, "epoch": 0.3948, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.480840897559227e-06, "loss": 0.0, "num_tokens": 226033128.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06964511796832085, "epoch": 0.39488, "grad_norm": 0.0, "learning_rate": 2.4804202066736697e-06, "loss": 0.0, "step": 4936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 245.3828125, "completions/mean_terminated_length": 237.8800048828125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.06316711008548737, "epoch": 0.39496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4799994646664126e-06, "loss": 0.0, "num_tokens": 226130073.0, "reward": 0.48395901918411255, "reward_std": 0.0, "rewards/reward_fn/mean": 0.48395901918411255, "rewards/reward_fn/std": 0.996755838394165, "step": 4937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0659845694899559, "epoch": 0.39504, "grad_norm": 0.0, "learning_rate": 2.479578671566904e-06, "loss": 0.0, "step": 4938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 235.6953125, "completions/mean_terminated_length": 219.3943634033203, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.0661977156996727, "epoch": 0.39512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.479157827404594e-06, "loss": 0.0, "num_tokens": 226225778.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06939908117055893, "epoch": 0.3952, "grad_norm": 0.0, "learning_rate": 2.4787369322089374e-06, "loss": 0.0, "step": 4940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 201.828125, "completions/mean_terminated_length": 192.38531494140625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.06587516516447067, "epoch": 0.39528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.478315986009392e-06, "loss": 0.0, "num_tokens": 226317148.0, "reward": 0.76492840051651, "reward_std": 0.0, "rewards/reward_fn/mean": 0.76492840051651, "rewards/reward_fn/std": 1.296067476272583, "step": 4941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06848971918225288, "epoch": 0.39536, "grad_norm": 0.0, "learning_rate": 2.47789498883542e-06, "loss": 0.0, "step": 4942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 218.8671875, "completions/mean_terminated_length": 195.83544921875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.0711168460547924, "epoch": 0.39544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4774739407164873e-06, "loss": 0.0, "num_tokens": 226410699.0, "reward": 0.1224115639925003, "reward_std": 0.0, "rewards/reward_fn/mean": 0.1224115639925003, "rewards/reward_fn/std": 0.32514312863349915, "step": 4943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07245852425694466, "epoch": 0.39552, "grad_norm": 0.0, "learning_rate": 2.4770528416820626e-06, "loss": 0.0, "step": 4944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 240.609375, "completions/mean_terminated_length": 233.093017578125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.07072056084871292, "epoch": 0.3956, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4766316917616173e-06, "loss": 0.0, "num_tokens": 226507033.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06853021308779716, "epoch": 0.39568, "grad_norm": 0.0, "learning_rate": 2.476210490984629e-06, "loss": 0.0, "step": 4946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5234375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 228.8203125, "completions/mean_terminated_length": 198.9672088623047, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.0671270564198494, "epoch": 0.39576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4757892393805763e-06, "loss": 0.0, "num_tokens": 226601858.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06754596903920174, "epoch": 0.39584, "grad_norm": 0.0, "learning_rate": 2.475367936978943e-06, "loss": 0.0, "step": 4948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 228.2734375, "completions/mean_terminated_length": 217.83871459960938, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.07310599461197853, "epoch": 0.39592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.474946583809214e-06, "loss": 0.0, "num_tokens": 226696613.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07339716702699661, "epoch": 0.396, "grad_norm": 0.0, "learning_rate": 2.474525179900883e-06, "loss": 0.0, "step": 4950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.0625, "completions/mean_terminated_length": 234.440673828125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.0775233581662178, "epoch": 0.39608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4741037252834405e-06, "loss": 0.0, "num_tokens": 226793645.0, "reward": 0.37749966979026794, "reward_std": 0.0, "rewards/reward_fn/mean": 0.37749966979026794, "rewards/reward_fn/std": 0.9951284527778625, "step": 4951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07459712773561478, "epoch": 0.39616, "grad_norm": 0.0, "learning_rate": 2.4736822199863857e-06, "loss": 0.0, "step": 4952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.2421875, "completions/mean_terminated_length": 224.30999755859375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.07549848407506943, "epoch": 0.39624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4732606640392186e-06, "loss": 0.0, "num_tokens": 226888780.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0766901932656765, "epoch": 0.39632, "grad_norm": 0.0, "learning_rate": 2.4728390574714445e-06, "loss": 0.0, "step": 4954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.421875, "completions/mean_terminated_length": 220.84375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.06664624810218811, "epoch": 0.3964, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4724174003125707e-06, "loss": 0.0, "num_tokens": 226984834.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06493028998374939, "epoch": 0.39648, "grad_norm": 0.0, "learning_rate": 2.4719956925921097e-06, "loss": 0.0, "step": 4956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 212.6328125, "completions/mean_terminated_length": 198.17709350585938, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.06943666189908981, "epoch": 0.39656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4715739343395754e-06, "loss": 0.0, "num_tokens": 227077587.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06838277354836464, "epoch": 0.39664, "grad_norm": 0.0, "learning_rate": 2.4711521255844866e-06, "loss": 0.0, "step": 4958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.203125, "completions/mean_terminated_length": 226.86154174804688, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.06490853056311607, "epoch": 0.39672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4707302663563665e-06, "loss": 0.0, "num_tokens": 227173997.0, "reward": 0.04093467444181442, "reward_std": 0.0, "rewards/reward_fn/mean": 0.04093467444181442, "rewards/reward_fn/std": 0.10872851312160492, "step": 4959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06701796501874924, "epoch": 0.3968, "grad_norm": 0.0, "learning_rate": 2.47030835668474e-06, "loss": 0.0, "step": 4960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 220.1171875, "completions/mean_terminated_length": 209.60606384277344, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.07116977497935295, "epoch": 0.39688, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.469886396599136e-06, "loss": 0.0, "num_tokens": 227267708.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0696757473051548, "epoch": 0.39696, "grad_norm": 0.0, "learning_rate": 2.4694643861290883e-06, "loss": 0.0, "step": 4962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 250.015625, "completions/mean_terminated_length": 242.79310607910156, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "entropy": 0.0701042152941227, "epoch": 0.39704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4690423253041324e-06, "loss": 0.0, "num_tokens": 227365246.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06780759245157242, "epoch": 0.39712, "grad_norm": 0.0, "learning_rate": 2.468620214153808e-06, "loss": 0.0, "step": 4964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.828125, "completions/mean_terminated_length": 224.4666748046875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.059829747304320335, "epoch": 0.3972, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4681980527076584e-06, "loss": 0.0, "num_tokens": 227460712.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0627024807035923, "epoch": 0.39728, "grad_norm": 0.0, "learning_rate": 2.467775840995231e-06, "loss": 0.0, "step": 4966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.40625, "completions/mean_terminated_length": 228.65753173828125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.06756351888179779, "epoch": 0.39736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.467353579046076e-06, "loss": 0.0, "num_tokens": 227557020.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06549370288848877, "epoch": 0.39744, "grad_norm": 0.0, "learning_rate": 2.4669312668897473e-06, "loss": 0.0, "step": 4968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.9765625, "completions/mean_terminated_length": 214.60000610351562, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.06225335970520973, "epoch": 0.39752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4665089045558024e-06, "loss": 0.0, "num_tokens": 227652633.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06563568115234375, "epoch": 0.3976, "grad_norm": 0.0, "learning_rate": 2.466086492073801e-06, "loss": 0.0, "step": 4970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.375, "completions/mean_terminated_length": 220.86956787109375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.06809297949075699, "epoch": 0.39768, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4656640294733087e-06, "loss": 0.0, "num_tokens": 227749321.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0639190599322319, "epoch": 0.39776, "grad_norm": 0.0, "learning_rate": 2.4652415167838934e-06, "loss": 0.0, "step": 4972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5234375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 234.7734375, "completions/mean_terminated_length": 211.45899963378906, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.07348395884037018, "epoch": 0.39784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4648189540351265e-06, "loss": 0.0, "num_tokens": 227844908.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06854179501533508, "epoch": 0.39792, "grad_norm": 0.0, "learning_rate": 2.4643963412565826e-06, "loss": 0.0, "step": 4974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.8125, "completions/mean_terminated_length": 236.37208557128906, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.06511135026812553, "epoch": 0.398, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.46397367847784e-06, "loss": 0.0, "num_tokens": 227941524.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0668795183300972, "epoch": 0.39808, "grad_norm": 0.0, "learning_rate": 2.4635509657284813e-06, "loss": 0.0, "step": 4976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.0234375, "completions/mean_terminated_length": 228.68931579589844, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.07225341349840164, "epoch": 0.39816, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.463128203038091e-06, "loss": 0.0, "num_tokens": 228037015.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07122248038649559, "epoch": 0.39824, "grad_norm": 0.0, "learning_rate": 2.462705390436259e-06, "loss": 0.0, "step": 4978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 223.1796875, "completions/mean_terminated_length": 204.13580322265625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.07226788252592087, "epoch": 0.39832, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.462282527952577e-06, "loss": 0.0, "num_tokens": 228131118.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06928277015686035, "epoch": 0.3984, "grad_norm": 0.0, "learning_rate": 2.461859615616642e-06, "loss": 0.0, "step": 4980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 219.09375, "completions/mean_terminated_length": 209.686279296875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.07153349369764328, "epoch": 0.39848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.461436653458052e-06, "loss": 0.0, "num_tokens": 228224698.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07166092097759247, "epoch": 0.39856, "grad_norm": 0.0, "learning_rate": 2.4610136415064097e-06, "loss": 0.0, "step": 4982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 244.859375, "completions/mean_terminated_length": 235.62857055664062, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.06623497605323792, "epoch": 0.39864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.460590579791323e-06, "loss": 0.0, "num_tokens": 228321576.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06566506996750832, "epoch": 0.39872, "grad_norm": 0.0, "learning_rate": 2.4601674683424007e-06, "loss": 0.0, "step": 4984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.234375, "completions/mean_terminated_length": 229.08509826660156, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.07447238266468048, "epoch": 0.3988, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.459744307189256e-06, "loss": 0.0, "num_tokens": 228417350.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07076051831245422, "epoch": 0.39888, "grad_norm": 0.0, "learning_rate": 2.459321096361507e-06, "loss": 0.0, "step": 4986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 227.296875, "completions/mean_terminated_length": 213.27906799316406, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.0706622451543808, "epoch": 0.39896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4588978358887728e-06, "loss": 0.0, "num_tokens": 228511980.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06614904850721359, "epoch": 0.39904, "grad_norm": 0.0, "learning_rate": 2.458474525800678e-06, "loss": 0.0, "step": 4988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.6875, "completions/mean_terminated_length": 231.6483612060547, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.07700890302658081, "epoch": 0.39912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.458051166126848e-06, "loss": 0.0, "num_tokens": 228608068.0, "reward": 0.03411313518881798, "reward_std": 0.0, "rewards/reward_fn/mean": 0.03411313518881798, "rewards/reward_fn/std": 0.09060950577259064, "step": 4989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07159815728664398, "epoch": 0.3992, "grad_norm": 0.0, "learning_rate": 2.4576277568969154e-06, "loss": 0.0, "step": 4990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.3671875, "completions/mean_terminated_length": 231.12307739257812, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.06767504289746284, "epoch": 0.39928, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4572042981405135e-06, "loss": 0.0, "num_tokens": 228704755.0, "reward": 0.0934104323387146, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0934104323387146, "rewards/reward_fn/std": 0.17312297224998474, "step": 4991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06582129001617432, "epoch": 0.39936, "grad_norm": 0.0, "learning_rate": 2.456780789887281e-06, "loss": 0.0, "step": 4992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.3671875, "completions/mean_terminated_length": 209.33685302734375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.06112036854028702, "epoch": 0.39944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4563572321668576e-06, "loss": 0.0, "num_tokens": 228798626.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.060764698311686516, "epoch": 0.39952, "grad_norm": 0.0, "learning_rate": 2.455933625008889e-06, "loss": 0.0, "step": 4994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 213.890625, "completions/mean_terminated_length": 191.0602264404297, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.07011161372065544, "epoch": 0.3996, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4555099684430226e-06, "loss": 0.0, "num_tokens": 228891540.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 4995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06803259626030922, "epoch": 0.39968, "grad_norm": 0.0, "learning_rate": 2.4550862624989095e-06, "loss": 0.0, "step": 4996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.46875, "completions/mean_terminated_length": 235.6962127685547, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.06779169663786888, "epoch": 0.39976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4546625072062055e-06, "loss": 0.0, "num_tokens": 228988240.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 4997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07020226866006851, "epoch": 0.39984, "grad_norm": 0.0, "learning_rate": 2.4542387025945686e-06, "loss": 0.0, "step": 4998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.3828125, "completions/mean_terminated_length": 224.19540405273438, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.07524699345231056, "epoch": 0.39992, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.453814848693661e-06, "loss": 0.0, "num_tokens": 229083777.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 4999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07836748659610748, "epoch": 0.4, "grad_norm": 0.0, "learning_rate": 2.4533909455331473e-06, "loss": 0.0, "step": 5000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.4296875, "completions/mean_terminated_length": 241.0404052734375, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "entropy": 0.0707600973546505, "epoch": 0.40008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4529669931426968e-06, "loss": 0.0, "num_tokens": 229180600.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07097424566745758, "epoch": 0.40016, "grad_norm": 0.0, "learning_rate": 2.452542991551981e-06, "loss": 0.0, "step": 5002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.5390625, "completions/mean_terminated_length": 236.6607208251953, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.0694497637450695, "epoch": 0.40024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4521189407906763e-06, "loss": 0.0, "num_tokens": 229277821.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06955979764461517, "epoch": 0.40032, "grad_norm": 0.0, "learning_rate": 2.4516948408884623e-06, "loss": 0.0, "step": 5004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.0546875, "completions/mean_terminated_length": 222.7899932861328, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.06332508847117424, "epoch": 0.4004, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.45127069187502e-06, "loss": 0.0, "num_tokens": 229372804.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06334640830755234, "epoch": 0.40048, "grad_norm": 0.0, "learning_rate": 2.450846493780036e-06, "loss": 0.0, "step": 5006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.65625, "completions/mean_terminated_length": 230.8205108642578, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.06835523247718811, "epoch": 0.40056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4504222466332e-06, "loss": 0.0, "num_tokens": 229469144.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06770160049200058, "epoch": 0.40064, "grad_norm": 0.0, "learning_rate": 2.4499979504642044e-06, "loss": 0.0, "step": 5008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.5625, "completions/mean_terminated_length": 230.9873504638672, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.07392498478293419, "epoch": 0.40072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4495736053027458e-06, "loss": 0.0, "num_tokens": 229565472.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07177344709634781, "epoch": 0.4008, "grad_norm": 0.0, "learning_rate": 2.4491492111785244e-06, "loss": 0.0, "step": 5010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 190.390625, "completions/mean_terminated_length": 178.95411682128906, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.07789919525384903, "epoch": 0.40088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4487247681212413e-06, "loss": 0.0, "num_tokens": 229655378.0, "reward": 0.799616277217865, "reward_std": 0.0, "rewards/reward_fn/mean": 0.799616277217865, "rewards/reward_fn/std": 1.281852126121521, "step": 5011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07538295909762383, "epoch": 0.40096, "grad_norm": 0.0, "learning_rate": 2.448300276160605e-06, "loss": 0.0, "step": 5012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.0078125, "completions/mean_terminated_length": 227.84999084472656, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.06472842395305634, "epoch": 0.40104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.447875735326325e-06, "loss": 0.0, "num_tokens": 229750867.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06649596616625786, "epoch": 0.40112, "grad_norm": 0.0, "learning_rate": 2.4474511456481144e-06, "loss": 0.0, "step": 5014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 240.796875, "completions/mean_terminated_length": 233.1058807373047, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.06229483336210251, "epoch": 0.4012, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.44702650715569e-06, "loss": 0.0, "num_tokens": 229847225.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06519853323698044, "epoch": 0.40128, "grad_norm": 0.0, "learning_rate": 2.4466018198787725e-06, "loss": 0.0, "step": 5016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.40625, "completions/mean_terminated_length": 213.43820190429688, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.06729724258184433, "epoch": 0.40136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4461770838470846e-06, "loss": 0.0, "num_tokens": 229941741.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06847164034843445, "epoch": 0.40144, "grad_norm": 0.0, "learning_rate": 2.4457522990903537e-06, "loss": 0.0, "step": 5018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.765625, "completions/mean_terminated_length": 232.21949768066406, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.07310934737324715, "epoch": 0.40152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.44532746563831e-06, "loss": 0.0, "num_tokens": 230038095.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07695171236991882, "epoch": 0.4016, "grad_norm": 0.0, "learning_rate": 2.4449025835206888e-06, "loss": 0.0, "step": 5020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.3046875, "completions/mean_terminated_length": 215.89022827148438, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.0676555335521698, "epoch": 0.40168, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.444477652767226e-06, "loss": 0.0, "num_tokens": 230133110.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06792338192462921, "epoch": 0.40176, "grad_norm": 0.0, "learning_rate": 2.4440526734076627e-06, "loss": 0.0, "step": 5022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.3125, "completions/mean_terminated_length": 210.56410217285156, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.06604393571615219, "epoch": 0.40184, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4436276454717427e-06, "loss": 0.0, "num_tokens": 230227870.0, "reward": 0.8688493967056274, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8688493967056274, "rewards/reward_fn/std": 1.27309250831604, "step": 5023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06408389657735825, "epoch": 0.40192, "grad_norm": 0.0, "learning_rate": 2.443202568989213e-06, "loss": 0.0, "step": 5024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 223.640625, "completions/mean_terminated_length": 204.22500610351562, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.0596056692302227, "epoch": 0.402, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.442777443989826e-06, "loss": 0.0, "num_tokens": 230322032.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05920649319887161, "epoch": 0.40208, "grad_norm": 0.0, "learning_rate": 2.4423522705033345e-06, "loss": 0.0, "step": 5026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.046875, "completions/mean_terminated_length": 241.45713806152344, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.06849075108766556, "epoch": 0.40216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4419270485594967e-06, "loss": 0.0, "num_tokens": 230419318.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07029031589627266, "epoch": 0.40224, "grad_norm": 0.0, "learning_rate": 2.441501778188074e-06, "loss": 0.0, "step": 5028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 217.5234375, "completions/mean_terminated_length": 201.87911987304688, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.06524159386754036, "epoch": 0.40232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4410764594188303e-06, "loss": 0.0, "num_tokens": 230512697.0, "reward": 0.4246163070201874, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4246163070201874, "rewards/reward_fn/std": 0.9858949184417725, "step": 5029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06815880537033081, "epoch": 0.4024, "grad_norm": 0.0, "learning_rate": 2.4406510922815337e-06, "loss": 0.0, "step": 5030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.203125, "completions/mean_terminated_length": 239.90321350097656, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.07440843433141708, "epoch": 0.40248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.440225676805955e-06, "loss": 0.0, "num_tokens": 230610003.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07341177016496658, "epoch": 0.40256, "grad_norm": 0.0, "learning_rate": 2.439800213021869e-06, "loss": 0.0, "step": 5032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 219.9921875, "completions/mean_terminated_length": 196.14285278320312, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.07476205378770828, "epoch": 0.40264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4393747009590543e-06, "loss": 0.0, "num_tokens": 230703698.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06732030585408211, "epoch": 0.40272, "grad_norm": 0.0, "learning_rate": 2.438949140647291e-06, "loss": 0.0, "step": 5034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 217.296875, "completions/mean_terminated_length": 207.43138122558594, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.07611670717597008, "epoch": 0.4028, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4385235321163652e-06, "loss": 0.0, "num_tokens": 230797048.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 5035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07242871820926666, "epoch": 0.40288, "grad_norm": 0.0, "learning_rate": 2.4380978753960637e-06, "loss": 0.0, "step": 5036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.1328125, "completions/mean_terminated_length": 232.471435546875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.06936214864253998, "epoch": 0.40296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.437672170516179e-06, "loss": 0.0, "num_tokens": 230893705.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06848963350057602, "epoch": 0.40304, "grad_norm": 0.0, "learning_rate": 2.4372464175065057e-06, "loss": 0.0, "step": 5038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.0, "completions/mean_terminated_length": 240.63999938964844, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.07097409665584564, "epoch": 0.40312, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4368206163968412e-06, "loss": 0.0, "num_tokens": 230990857.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07006650045514107, "epoch": 0.4032, "grad_norm": 0.0, "learning_rate": 2.436394767216988e-06, "loss": 0.0, "step": 5040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.046875, "completions/mean_terminated_length": 238.63636779785156, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.06595724076032639, "epoch": 0.40328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4359688699967516e-06, "loss": 0.0, "num_tokens": 231088015.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06746267154812813, "epoch": 0.40336, "grad_norm": 0.0, "learning_rate": 2.435542924765939e-06, "loss": 0.0, "step": 5042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 234.015625, "completions/mean_terminated_length": 226.6875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.0690922960639, "epoch": 0.40344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.435116931554362e-06, "loss": 0.0, "num_tokens": 231183505.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06862930208444595, "epoch": 0.40352, "grad_norm": 0.0, "learning_rate": 2.434690890391836e-06, "loss": 0.0, "step": 5044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.78125, "completions/mean_terminated_length": 228.35955810546875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.06885618716478348, "epoch": 0.4036, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.43426480130818e-06, "loss": 0.0, "num_tokens": 231279349.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06876382231712341, "epoch": 0.40368, "grad_norm": 0.0, "learning_rate": 2.4338386643332147e-06, "loss": 0.0, "step": 5046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6015625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 248.4453125, "completions/mean_terminated_length": 237.0392303466797, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.07445081323385239, "epoch": 0.40376, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4334124794967657e-06, "loss": 0.0, "num_tokens": 231376686.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07653619721531868, "epoch": 0.40384, "grad_norm": 0.0, "learning_rate": 2.4329862468286613e-06, "loss": 0.0, "step": 5048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.296875, "completions/mean_terminated_length": 211.61038208007812, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.07581010833382607, "epoch": 0.40392, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4325599663587333e-06, "loss": 0.0, "num_tokens": 231471572.0, "reward": 0.8206124305725098, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8206124305725098, "rewards/reward_fn/std": 1.2764060497283936, "step": 5049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07260218262672424, "epoch": 0.404, "grad_norm": 0.0, "learning_rate": 2.4321336381168167e-06, "loss": 0.0, "step": 5050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.609375, "completions/mean_terminated_length": 213.79220581054688, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.07069966942071915, "epoch": 0.40408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4317072621327506e-06, "loss": 0.0, "num_tokens": 231566626.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07380310818552971, "epoch": 0.40416, "grad_norm": 0.0, "learning_rate": 2.4312808384363758e-06, "loss": 0.0, "step": 5052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 234.8203125, "completions/mean_terminated_length": 227.7604217529297, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.0689440593123436, "epoch": 0.40424, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.430854367057538e-06, "loss": 0.0, "num_tokens": 231662219.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06843571364879608, "epoch": 0.40432, "grad_norm": 0.0, "learning_rate": 2.4304278480260864e-06, "loss": 0.0, "step": 5054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.2109375, "completions/mean_terminated_length": 231.29884338378906, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.07175258547067642, "epoch": 0.4044, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4300012813718715e-06, "loss": 0.0, "num_tokens": 231758374.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07309219613671303, "epoch": 0.40448, "grad_norm": 0.0, "learning_rate": 2.429574667124749e-06, "loss": 0.0, "step": 5056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.9375, "completions/mean_terminated_length": 237.0, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.06978216022253036, "epoch": 0.40456, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.429148005314578e-06, "loss": 0.0, "num_tokens": 231855006.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07112500071525574, "epoch": 0.40464, "grad_norm": 0.0, "learning_rate": 2.428721295971219e-06, "loss": 0.0, "step": 5058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.7421875, "completions/mean_terminated_length": 209.08621215820312, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.07394179329276085, "epoch": 0.40472, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4282945391245387e-06, "loss": 0.0, "num_tokens": 231950589.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07396428287029266, "epoch": 0.4048, "grad_norm": 0.0, "learning_rate": 2.427867734804404e-06, "loss": 0.0, "step": 5060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 240.9765625, "completions/mean_terminated_length": 233.10714721679688, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.06998688727617264, "epoch": 0.40488, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4274408830406885e-06, "loss": 0.0, "num_tokens": 232046970.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06871187686920166, "epoch": 0.40496, "grad_norm": 0.0, "learning_rate": 2.4270139838632653e-06, "loss": 0.0, "step": 5062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 241.1953125, "completions/mean_terminated_length": 231.7051239013672, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.06914785504341125, "epoch": 0.40504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4265870373020148e-06, "loss": 0.0, "num_tokens": 232143379.0, "reward": 0.0982079803943634, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0982079803943634, "rewards/reward_fn/std": 0.2608548700809479, "step": 5063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0663042850792408, "epoch": 0.40512, "grad_norm": 0.0, "learning_rate": 2.4261600433868172e-06, "loss": 0.0, "step": 5064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.34375, "completions/mean_terminated_length": 234.40000915527344, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.06843585893511772, "epoch": 0.4052, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4257330021475586e-06, "loss": 0.0, "num_tokens": 232240063.0, "reward": 0.10006237775087357, "reward_std": 0.0, "rewards/reward_fn/mean": 0.10006237775087357, "rewards/reward_fn/std": 0.26578041911125183, "step": 5065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06756250560283661, "epoch": 0.40528, "grad_norm": 0.0, "learning_rate": 2.425305913614126e-06, "loss": 0.0, "step": 5066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.0546875, "completions/mean_terminated_length": 208.08750915527344, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.058187514543533325, "epoch": 0.40536, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.424878777816413e-06, "loss": 0.0, "num_tokens": 232334534.0, "reward": 0.06713119894266129, "reward_std": 0.0, "rewards/reward_fn/mean": 0.06713119894266129, "rewards/reward_fn/std": 0.1783103495836258, "step": 5067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05944269150495529, "epoch": 0.40544, "grad_norm": 0.0, "learning_rate": 2.4244515947843138e-06, "loss": 0.0, "step": 5068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.8203125, "completions/mean_terminated_length": 205.9027862548828, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.07295995578169823, "epoch": 0.40552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4240243645477253e-06, "loss": 0.0, "num_tokens": 232429231.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07415252178907394, "epoch": 0.4056, "grad_norm": 0.0, "learning_rate": 2.4235970871365515e-06, "loss": 0.0, "step": 5070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.6796875, "completions/mean_terminated_length": 230.532470703125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.07504136115312576, "epoch": 0.40568, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.423169762580696e-06, "loss": 0.0, "num_tokens": 232525574.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0733790397644043, "epoch": 0.40576, "grad_norm": 0.0, "learning_rate": 2.4227423909100665e-06, "loss": 0.0, "step": 5072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.171875, "completions/mean_terminated_length": 208.27500915527344, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.05671374872326851, "epoch": 0.40584, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4223149721545752e-06, "loss": 0.0, "num_tokens": 232620060.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06090288609266281, "epoch": 0.40592, "grad_norm": 0.0, "learning_rate": 2.4218875063441366e-06, "loss": 0.0, "step": 5074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.1796875, "completions/mean_terminated_length": 214.16456604003906, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.06508385203778744, "epoch": 0.406, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.42145999350867e-06, "loss": 0.0, "num_tokens": 232715059.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06862446665763855, "epoch": 0.40608, "grad_norm": 0.0, "learning_rate": 2.421032433678095e-06, "loss": 0.0, "step": 5076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 234.015625, "completions/mean_terminated_length": 220.37974548339844, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.06576738506555557, "epoch": 0.40616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.420604826882338e-06, "loss": 0.0, "num_tokens": 232810549.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0656275674700737, "epoch": 0.40624, "grad_norm": 0.0, "learning_rate": 2.4201771731513252e-06, "loss": 0.0, "step": 5078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.6796875, "completions/mean_terminated_length": 224.25555419921875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.07008398696780205, "epoch": 0.40632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4197494725149886e-06, "loss": 0.0, "num_tokens": 232905996.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07011592760682106, "epoch": 0.4064, "grad_norm": 0.0, "learning_rate": 2.4193217250032634e-06, "loss": 0.0, "step": 5080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.375, "completions/mean_terminated_length": 225.5157928466797, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.07053124904632568, "epoch": 0.40648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4188939306460867e-06, "loss": 0.0, "num_tokens": 233001404.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07405427470803261, "epoch": 0.40656, "grad_norm": 0.0, "learning_rate": 2.4184660894733994e-06, "loss": 0.0, "step": 5082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 246.484375, "completions/mean_terminated_length": 239.760009765625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.06254028156399727, "epoch": 0.40664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4180382015151468e-06, "loss": 0.0, "num_tokens": 233098490.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06226181238889694, "epoch": 0.40672, "grad_norm": 0.0, "learning_rate": 2.4176102668012755e-06, "loss": 0.0, "step": 5084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.2578125, "completions/mean_terminated_length": 236.1097412109375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.07379240170121193, "epoch": 0.4068, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4171822853617368e-06, "loss": 0.0, "num_tokens": 233195163.0, "reward": 0.09413323551416397, "reward_std": 0.0, "rewards/reward_fn/mean": 0.09413323551416397, "rewards/reward_fn/std": 0.2500317096710205, "step": 5085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07218865677714348, "epoch": 0.40688, "grad_norm": 0.0, "learning_rate": 2.416754257226484e-06, "loss": 0.0, "step": 5086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.765625, "completions/mean_terminated_length": 240.84616088867188, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.06191800348460674, "epoch": 0.40696, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.416326182425477e-06, "loss": 0.0, "num_tokens": 233292285.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06318377330899239, "epoch": 0.40704, "grad_norm": 0.0, "learning_rate": 2.4158980609886744e-06, "loss": 0.0, "step": 5088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.7421875, "completions/mean_terminated_length": 233.13186645507812, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.07389472797513008, "epoch": 0.40712, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4154698929460406e-06, "loss": 0.0, "num_tokens": 233388508.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07468746975064278, "epoch": 0.4072, "grad_norm": 0.0, "learning_rate": 2.415041678327543e-06, "loss": 0.0, "step": 5090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5078125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.6015625, "completions/mean_terminated_length": 224.71429443359375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.06714208796620369, "epoch": 0.40728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.414613417163152e-06, "loss": 0.0, "num_tokens": 233484841.0, "reward": 0.0074910130351781845, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0074910130351781845, "rewards/reward_fn/std": 0.019897233694791794, "step": 5091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06655583903193474, "epoch": 0.40736, "grad_norm": 0.0, "learning_rate": 2.4141851094828416e-06, "loss": 0.0, "step": 5092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.9375, "completions/mean_terminated_length": 217.51112365722656, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.07145945727825165, "epoch": 0.40744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4137567553165883e-06, "loss": 0.0, "num_tokens": 233579681.0, "reward": 0.48206061124801636, "reward_std": 0.0, "rewards/reward_fn/mean": 0.48206061124801636, "rewards/reward_fn/std": 0.9960240125656128, "step": 5093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06907295808196068, "epoch": 0.40752, "grad_norm": 0.0, "learning_rate": 2.4133283546943735e-06, "loss": 0.0, "step": 5094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.9765625, "completions/mean_terminated_length": 223.9605255126953, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.060134151950478554, "epoch": 0.4076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4128999076461794e-06, "loss": 0.0, "num_tokens": 233675550.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06117971986532211, "epoch": 0.40768, "grad_norm": 0.0, "learning_rate": 2.4124714142019937e-06, "loss": 0.0, "step": 5096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.140625, "completions/mean_terminated_length": 222.56565856933594, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.07349549606442451, "epoch": 0.40776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.412042874391805e-06, "loss": 0.0, "num_tokens": 233770544.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07450282946228981, "epoch": 0.40784, "grad_norm": 0.0, "learning_rate": 2.4116142882456083e-06, "loss": 0.0, "step": 5098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 249.84375, "completions/mean_terminated_length": 242.86668395996094, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.07142031192779541, "epoch": 0.40792, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4111856557933993e-06, "loss": 0.0, "num_tokens": 233868060.0, "reward": 0.02706475742161274, "reward_std": 0.0, "rewards/reward_fn/mean": 0.02706475742161274, "rewards/reward_fn/std": 0.07188798487186432, "step": 5099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07019533962011337, "epoch": 0.408, "grad_norm": 0.0, "learning_rate": 2.4107569770651783e-06, "loss": 0.0, "step": 5100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.25, "completions/mean_terminated_length": 239.8279571533203, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.06622251123189926, "epoch": 0.40808, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4103282520909476e-06, "loss": 0.0, "num_tokens": 233964860.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06418558210134506, "epoch": 0.40816, "grad_norm": 0.0, "learning_rate": 2.4098994809007135e-06, "loss": 0.0, "step": 5102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.3203125, "completions/mean_terminated_length": 231.3505096435547, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.0677747055888176, "epoch": 0.40824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4094706635244853e-06, "loss": 0.0, "num_tokens": 234060773.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06834514811635017, "epoch": 0.40832, "grad_norm": 0.0, "learning_rate": 2.4090417999922765e-06, "loss": 0.0, "step": 5104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.8359375, "completions/mean_terminated_length": 239.7083282470703, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.07524849474430084, "epoch": 0.4084, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4086128903341016e-06, "loss": 0.0, "num_tokens": 234157904.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07415113598108292, "epoch": 0.40848, "grad_norm": 0.0, "learning_rate": 2.408183934579982e-06, "loss": 0.0, "step": 5106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.7734375, "completions/mean_terminated_length": 220.1807098388672, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.06897981464862823, "epoch": 0.40856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.407754932759938e-06, "loss": 0.0, "num_tokens": 234253235.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06782380491495132, "epoch": 0.40864, "grad_norm": 0.0, "learning_rate": 2.407325884903996e-06, "loss": 0.0, "step": 5108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 251.0234375, "completions/mean_terminated_length": 242.7291717529297, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.07016133144497871, "epoch": 0.40872, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4068967910421845e-06, "loss": 0.0, "num_tokens": 234350902.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06946840509772301, "epoch": 0.4088, "grad_norm": 0.0, "learning_rate": 2.4064676512045363e-06, "loss": 0.0, "step": 5110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.15625, "completions/mean_terminated_length": 238.84849548339844, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.06885464861989021, "epoch": 0.40888, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4060384654210855e-06, "loss": 0.0, "num_tokens": 234448074.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07156038656830788, "epoch": 0.40896, "grad_norm": 0.0, "learning_rate": 2.4056092337218714e-06, "loss": 0.0, "step": 5112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.90625, "completions/mean_terminated_length": 214.87379455566406, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.07938030362129211, "epoch": 0.40904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4051799561369356e-06, "loss": 0.0, "num_tokens": 234542142.0, "reward": 0.13511891663074493, "reward_std": 0.0, "rewards/reward_fn/mean": 0.13511891663074493, "rewards/reward_fn/std": 0.2804263234138489, "step": 5113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0750100277364254, "epoch": 0.40912, "grad_norm": 0.0, "learning_rate": 2.4047506326963224e-06, "loss": 0.0, "step": 5114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.453125, "completions/mean_terminated_length": 219.47222900390625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.07354680448770523, "epoch": 0.4092, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.404321263430081e-06, "loss": 0.0, "num_tokens": 234637816.0, "reward": 0.4091131389141083, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4091131389141083, "rewards/reward_fn/std": 0.9871928691864014, "step": 5115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07813557982444763, "epoch": 0.40928, "grad_norm": 0.0, "learning_rate": 2.403891848368262e-06, "loss": 0.0, "step": 5116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.328125, "completions/mean_terminated_length": 218.59259033203125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.07502302527427673, "epoch": 0.40936, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.40346238754092e-06, "loss": 0.0, "num_tokens": 234733090.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07534231245517731, "epoch": 0.40944, "grad_norm": 0.0, "learning_rate": 2.4030328809781125e-06, "loss": 0.0, "step": 5118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.78125, "completions/mean_terminated_length": 215.3478240966797, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.07036057114601135, "epoch": 0.40952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.4026033287099007e-06, "loss": 0.0, "num_tokens": 234827654.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06865577772259712, "epoch": 0.4096, "grad_norm": 0.0, "learning_rate": 2.4021737307663487e-06, "loss": 0.0, "step": 5120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.9296875, "completions/mean_terminated_length": 229.8985595703125, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.06580943241715431, "epoch": 0.40968, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.401744087177524e-06, "loss": 0.0, "num_tokens": 234924157.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.068660669028759, "epoch": 0.40976, "grad_norm": 0.0, "learning_rate": 2.401314397973497e-06, "loss": 0.0, "step": 5122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.3359375, "completions/mean_terminated_length": 238.75531005859375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.07199511304497719, "epoch": 0.40984, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.400884663184341e-06, "loss": 0.0, "num_tokens": 235020840.0, "reward": 0.06319478154182434, "reward_std": 0.0, "rewards/reward_fn/mean": 0.06319478154182434, "rewards/reward_fn/std": 0.11610779166221619, "step": 5123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07070527225732803, "epoch": 0.40992, "grad_norm": 0.0, "learning_rate": 2.4004548828401336e-06, "loss": 0.0, "step": 5124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 213.0625, "completions/mean_terminated_length": 194.93333435058594, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.06563414260745049, "epoch": 0.41, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.400025056970954e-06, "loss": 0.0, "num_tokens": 235113648.0, "reward": 1.5, "reward_std": 0.0, "rewards/reward_fn/mean": 1.5, "rewards/reward_fn/std": 1.5058939456939697, "step": 5125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06774784624576569, "epoch": 0.41008, "grad_norm": 0.0, "learning_rate": 2.399595185606886e-06, "loss": 0.0, "step": 5126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 228.390625, "completions/mean_terminated_length": 221.00990295410156, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.07668270170688629, "epoch": 0.41016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3991652687780163e-06, "loss": 0.0, "num_tokens": 235208418.0, "reward": 0.41815176606178284, "reward_std": 0.0, "rewards/reward_fn/mean": 0.41815176606178284, "rewards/reward_fn/std": 0.9862273335456848, "step": 5127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0723930224776268, "epoch": 0.41024, "grad_norm": 0.0, "learning_rate": 2.398735306514435e-06, "loss": 0.0, "step": 5128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 197.9375, "completions/mean_terminated_length": 187.81651306152344, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.07136297971010208, "epoch": 0.41032, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3983052988462333e-06, "loss": 0.0, "num_tokens": 235299290.0, "reward": 0.002499666763469577, "reward_std": 0.0, "rewards/reward_fn/mean": 0.002499666763469577, "rewards/reward_fn/std": 0.006639483384788036, "step": 5129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0658223144710064, "epoch": 0.4104, "grad_norm": 0.0, "learning_rate": 2.397875245803508e-06, "loss": 0.0, "step": 5130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 235.390625, "completions/mean_terminated_length": 226.6888885498047, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.06984524428844452, "epoch": 0.41048, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3974451474163583e-06, "loss": 0.0, "num_tokens": 235394956.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06997695937752724, "epoch": 0.41056, "grad_norm": 0.0, "learning_rate": 2.397015003714887e-06, "loss": 0.0, "step": 5132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 220.0703125, "completions/mean_terminated_length": 199.22222900390625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.06918352097272873, "epoch": 0.41064, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.396584814729199e-06, "loss": 0.0, "num_tokens": 235488661.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07112092897295952, "epoch": 0.41072, "grad_norm": 0.0, "learning_rate": 2.396154580489403e-06, "loss": 0.0, "step": 5134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 247.6015625, "completions/mean_terminated_length": 239.46153259277344, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.07568441331386566, "epoch": 0.4108, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.395724301025611e-06, "loss": 0.0, "num_tokens": 235585890.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07402880489826202, "epoch": 0.41088, "grad_norm": 0.0, "learning_rate": 2.395293976367939e-06, "loss": 0.0, "step": 5136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 187.1796875, "completions/mean_terminated_length": 157.02247619628906, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.07087850570678711, "epoch": 0.41096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3948636065465036e-06, "loss": 0.0, "num_tokens": 235675385.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07042743638157845, "epoch": 0.41104, "grad_norm": 0.0, "learning_rate": 2.394433191591427e-06, "loss": 0.0, "step": 5138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 237.796875, "completions/mean_terminated_length": 230.67391967773438, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.06736624240875244, "epoch": 0.41112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3940027315328326e-06, "loss": 0.0, "num_tokens": 235771359.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07188098877668381, "epoch": 0.4112, "grad_norm": 0.0, "learning_rate": 2.3935722264008495e-06, "loss": 0.0, "step": 5140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.875, "completions/mean_terminated_length": 220.6593475341797, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.07045550271868706, "epoch": 0.41128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3931416762256085e-06, "loss": 0.0, "num_tokens": 235866447.0, "reward": 0.4799567759037018, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4799567759037018, "rewards/reward_fn/std": 0.9952422976493835, "step": 5141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07342811673879623, "epoch": 0.41136, "grad_norm": 0.0, "learning_rate": 2.3927110810372426e-06, "loss": 0.0, "step": 5142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.203125, "completions/mean_terminated_length": 232.90243530273438, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.06546871364116669, "epoch": 0.41144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3922804408658897e-06, "loss": 0.0, "num_tokens": 235962857.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0657946765422821, "epoch": 0.41152, "grad_norm": 0.0, "learning_rate": 2.391849755741689e-06, "loss": 0.0, "step": 5144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 213.3359375, "completions/mean_terminated_length": 193.9431915283203, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06742998212575912, "epoch": 0.4116, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.391419025694785e-06, "loss": 0.0, "num_tokens": 236055700.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06536389514803886, "epoch": 0.41168, "grad_norm": 0.0, "learning_rate": 2.3909882507553234e-06, "loss": 0.0, "step": 5146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 225.1015625, "completions/mean_terminated_length": 213.92552185058594, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.07643124833703041, "epoch": 0.41176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3905574309534554e-06, "loss": 0.0, "num_tokens": 236150049.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07079453021287918, "epoch": 0.41184, "grad_norm": 0.0, "learning_rate": 2.390126566319332e-06, "loss": 0.0, "step": 5148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.5625, "completions/mean_terminated_length": 226.63157653808594, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.07322941720485687, "epoch": 0.41192, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3896956568831103e-06, "loss": 0.0, "num_tokens": 236246121.0, "reward": 0.625, "reward_std": 0.0, "rewards/reward_fn/mean": 0.625, "rewards/reward_fn/std": 1.1153898239135742, "step": 5149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07032131031155586, "epoch": 0.412, "grad_norm": 0.0, "learning_rate": 2.3892647026749487e-06, "loss": 0.0, "step": 5150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.953125, "completions/mean_terminated_length": 233.73077392578125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.06762632727622986, "epoch": 0.41208, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.38883370372501e-06, "loss": 0.0, "num_tokens": 236343267.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07169536501169205, "epoch": 0.41216, "grad_norm": 0.0, "learning_rate": 2.3884026600634595e-06, "loss": 0.0, "step": 5152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.078125, "completions/mean_terminated_length": 236.3095245361328, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.06681683659553528, "epoch": 0.41224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.387971571720465e-06, "loss": 0.0, "num_tokens": 236439917.0, "reward": 0.49947303533554077, "reward_std": 0.0, "rewards/reward_fn/mean": 0.49947303533554077, "rewards/reward_fn/std": 1.0036656856536865, "step": 5153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06809370219707489, "epoch": 0.41232, "grad_norm": 0.0, "learning_rate": 2.3875404387262003e-06, "loss": 0.0, "step": 5154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 236.203125, "completions/mean_terminated_length": 226.53488159179688, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.07184894010424614, "epoch": 0.4124, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3871092611108377e-06, "loss": 0.0, "num_tokens": 236535687.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06860378384590149, "epoch": 0.41248, "grad_norm": 0.0, "learning_rate": 2.3866780389045555e-06, "loss": 0.0, "step": 5156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 220.578125, "completions/mean_terminated_length": 207.7659454345703, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.07028684765100479, "epoch": 0.41256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3862467721375357e-06, "loss": 0.0, "num_tokens": 236629457.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07043134793639183, "epoch": 0.41264, "grad_norm": 0.0, "learning_rate": 2.3858154608399613e-06, "loss": 0.0, "step": 5158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.0546875, "completions/mean_terminated_length": 221.01612854003906, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.07008632272481918, "epoch": 0.41272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3853841050420204e-06, "loss": 0.0, "num_tokens": 236725592.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07261833176016808, "epoch": 0.4128, "grad_norm": 0.0, "learning_rate": 2.3849527047739035e-06, "loss": 0.0, "step": 5160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.8828125, "completions/mean_terminated_length": 223.7916717529297, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.0784405954182148, "epoch": 0.41288, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3845212600658033e-06, "loss": 0.0, "num_tokens": 236821577.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07721319794654846, "epoch": 0.41296, "grad_norm": 0.0, "learning_rate": 2.384089770947917e-06, "loss": 0.0, "step": 5162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 213.1796875, "completions/mean_terminated_length": 188.3333282470703, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.060123056173324585, "epoch": 0.41304, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.383658237450444e-06, "loss": 0.0, "num_tokens": 236914400.0, "reward": 0.7931517362594604, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7931517362594604, "rewards/reward_fn/std": 1.2840120792388916, "step": 5163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06056319177150726, "epoch": 0.41312, "grad_norm": 0.0, "learning_rate": 2.383226659603586e-06, "loss": 0.0, "step": 5164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.703125, "completions/mean_terminated_length": 222.33999633789062, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.06612031161785126, "epoch": 0.4132, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.382795037437551e-06, "loss": 0.0, "num_tokens": 237009338.0, "reward": 0.11952967941761017, "reward_std": 0.0, "rewards/reward_fn/mean": 0.11952967941761017, "rewards/reward_fn/std": 0.317488431930542, "step": 5165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06516583263874054, "epoch": 0.41328, "grad_norm": 0.0, "learning_rate": 2.382363370982547e-06, "loss": 0.0, "step": 5166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6015625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.5, "completions/mean_terminated_length": 229.64706420898438, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.08027011156082153, "epoch": 0.41336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3819316602687857e-06, "loss": 0.0, "num_tokens": 237106298.0, "reward": 0.03411313518881798, "reward_std": 0.0, "rewards/reward_fn/mean": 0.03411313518881798, "rewards/reward_fn/std": 0.09060950577259064, "step": 5167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07990230992436409, "epoch": 0.41344, "grad_norm": 0.0, "learning_rate": 2.3814999053264825e-06, "loss": 0.0, "step": 5168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.109375, "completions/mean_terminated_length": 220.44186401367188, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.07606149837374687, "epoch": 0.41352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3810681061858565e-06, "loss": 0.0, "num_tokens": 237201544.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07240220159292221, "epoch": 0.4136, "grad_norm": 0.0, "learning_rate": 2.3806362628771278e-06, "loss": 0.0, "step": 5170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.40625, "completions/mean_terminated_length": 222.6428680419922, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.07317877188324928, "epoch": 0.41368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.380204375430521e-06, "loss": 0.0, "num_tokens": 237297980.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07024378329515457, "epoch": 0.41376, "grad_norm": 0.0, "learning_rate": 2.3797724438762655e-06, "loss": 0.0, "step": 5172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.6171875, "completions/mean_terminated_length": 219.494384765625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.06845208257436752, "epoch": 0.41384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3793404682445897e-06, "loss": 0.0, "num_tokens": 237393035.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06980462372303009, "epoch": 0.41392, "grad_norm": 0.0, "learning_rate": 2.378908448565728e-06, "loss": 0.0, "step": 5174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.59375, "completions/mean_terminated_length": 212.63999938964844, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.0659993439912796, "epoch": 0.414, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.378476384869918e-06, "loss": 0.0, "num_tokens": 237488087.0, "reward": 0.03641407564282417, "reward_std": 0.0, "rewards/reward_fn/mean": 0.03641407564282417, "rewards/reward_fn/std": 0.09672114253044128, "step": 5175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06852608174085617, "epoch": 0.41408, "grad_norm": 0.0, "learning_rate": 2.3780442771873988e-06, "loss": 0.0, "step": 5176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.2265625, "completions/mean_terminated_length": 210.80821228027344, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.07048850506544113, "epoch": 0.41416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3776121255484138e-06, "loss": 0.0, "num_tokens": 237583092.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06731395423412323, "epoch": 0.41424, "grad_norm": 0.0, "learning_rate": 2.377179929983209e-06, "loss": 0.0, "step": 5178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.5234375, "completions/mean_terminated_length": 232.30337524414062, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.06989548727869987, "epoch": 0.41432, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.376747690522033e-06, "loss": 0.0, "num_tokens": 237679287.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06978368759155273, "epoch": 0.4144, "grad_norm": 0.0, "learning_rate": 2.376315407195138e-06, "loss": 0.0, "step": 5180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.0625, "completions/mean_terminated_length": 224.1052703857422, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.07363021373748779, "epoch": 0.41448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3758830800327798e-06, "loss": 0.0, "num_tokens": 237775167.0, "reward": 0.43078044056892395, "reward_std": 0.0, "rewards/reward_fn/mean": 0.43078044056892395, "rewards/reward_fn/std": 0.9858564138412476, "step": 5181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06954453885555267, "epoch": 0.41456, "grad_norm": 0.0, "learning_rate": 2.375450709065217e-06, "loss": 0.0, "step": 5182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 220.0390625, "completions/mean_terminated_length": 196.2207794189453, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.08201859146356583, "epoch": 0.41464, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3750182943227104e-06, "loss": 0.0, "num_tokens": 237868868.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0761185847222805, "epoch": 0.41472, "grad_norm": 0.0, "learning_rate": 2.374585835835525e-06, "loss": 0.0, "step": 5184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 223.546875, "completions/mean_terminated_length": 208.25286865234375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.05854665860533714, "epoch": 0.4148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.374153333633927e-06, "loss": 0.0, "num_tokens": 237963018.0, "reward": 0.4806819558143616, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4806819558143616, "rewards/reward_fn/std": 0.9955083131790161, "step": 5185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.060829779133200645, "epoch": 0.41488, "grad_norm": 0.0, "learning_rate": 2.3737207877481884e-06, "loss": 0.0, "step": 5186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.1953125, "completions/mean_terminated_length": 207.4264678955078, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.06674877554178238, "epoch": 0.41496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3732881982085827e-06, "loss": 0.0, "num_tokens": 238058019.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06947857141494751, "epoch": 0.41504, "grad_norm": 0.0, "learning_rate": 2.372855565045386e-06, "loss": 0.0, "step": 5188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 223.640625, "completions/mean_terminated_length": 202.89744567871094, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.07245220243930817, "epoch": 0.41512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.372422888288879e-06, "loss": 0.0, "num_tokens": 238152181.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07601245865225792, "epoch": 0.4152, "grad_norm": 0.0, "learning_rate": 2.371990167969343e-06, "loss": 0.0, "step": 5190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.9453125, "completions/mean_terminated_length": 235.14865112304688, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.07883761078119278, "epoch": 0.41528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.371557404117065e-06, "loss": 0.0, "num_tokens": 238248942.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07353180646896362, "epoch": 0.41536, "grad_norm": 0.0, "learning_rate": 2.371124596762334e-06, "loss": 0.0, "step": 5192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.1328125, "completions/mean_terminated_length": 234.00962829589844, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.07181448489427567, "epoch": 0.41544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3706917459354408e-06, "loss": 0.0, "num_tokens": 238344959.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07515174522995949, "epoch": 0.41552, "grad_norm": 0.0, "learning_rate": 2.3702588516666813e-06, "loss": 0.0, "step": 5194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.65625, "completions/mean_terminated_length": 210.72727966308594, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.06467730551958084, "epoch": 0.4156, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.369825913986354e-06, "loss": 0.0, "num_tokens": 238440275.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06339367479085922, "epoch": 0.41568, "grad_norm": 0.0, "learning_rate": 2.3693929329247587e-06, "loss": 0.0, "step": 5196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 245.2265625, "completions/mean_terminated_length": 237.3648681640625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.06454312615096569, "epoch": 0.41576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3689599085122003e-06, "loss": 0.0, "num_tokens": 238537200.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06574283167719841, "epoch": 0.41584, "grad_norm": 0.0, "learning_rate": 2.368526840778986e-06, "loss": 0.0, "step": 5198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 222.5, "completions/mean_terminated_length": 207.27273559570312, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.07140522077679634, "epoch": 0.41592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3680937297554253e-06, "loss": 0.0, "num_tokens": 238631216.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0782868042588234, "epoch": 0.416, "grad_norm": 0.0, "learning_rate": 2.3676605754718317e-06, "loss": 0.0, "step": 5200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.2734375, "completions/mean_terminated_length": 218.40660095214844, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.07446540519595146, "epoch": 0.41608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3672273779585225e-06, "loss": 0.0, "num_tokens": 238726099.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07548024877905846, "epoch": 0.41616, "grad_norm": 0.0, "learning_rate": 2.3667941372458155e-06, "loss": 0.0, "step": 5202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.34375, "completions/mean_terminated_length": 238.6511688232422, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.07214770466089249, "epoch": 0.41624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.366360853364033e-06, "loss": 0.0, "num_tokens": 238822911.0, "reward": 0.08830241858959198, "reward_std": 0.0, "rewards/reward_fn/mean": 0.08830241858959198, "rewards/reward_fn/std": 0.23454421758651733, "step": 5203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07378586009144783, "epoch": 0.41632, "grad_norm": 0.0, "learning_rate": 2.365927526343501e-06, "loss": 0.0, "step": 5204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.3671875, "completions/mean_terminated_length": 212.80821228027344, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.07124631479382515, "epoch": 0.4164, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3654941562145475e-06, "loss": 0.0, "num_tokens": 238918062.0, "reward": 0.0074910130351781845, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0074910130351781845, "rewards/reward_fn/std": 0.019897233694791794, "step": 5205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06989338248968124, "epoch": 0.41648, "grad_norm": 0.0, "learning_rate": 2.365060743007504e-06, "loss": 0.0, "step": 5206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.7578125, "completions/mean_terminated_length": 203.2777862548828, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.06520776823163033, "epoch": 0.41656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3646272867527043e-06, "loss": 0.0, "num_tokens": 239013519.0, "reward": 0.5015839338302612, "reward_std": 0.0, "rewards/reward_fn/mean": 0.5015839338302612, "rewards/reward_fn/std": 0.9998448491096497, "step": 5207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06632759422063828, "epoch": 0.41664, "grad_norm": 0.0, "learning_rate": 2.3641937874804872e-06, "loss": 0.0, "step": 5208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.3828125, "completions/mean_terminated_length": 215.8985595703125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.0711558610200882, "epoch": 0.41672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.363760245221192e-06, "loss": 0.0, "num_tokens": 239109056.0, "reward": 0.12499912083148956, "reward_std": 0.0, "rewards/reward_fn/mean": 0.12499912083148956, "rewards/reward_fn/std": 0.33201608061790466, "step": 5209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06904115900397301, "epoch": 0.4168, "grad_norm": 0.0, "learning_rate": 2.3633266600051614e-06, "loss": 0.0, "step": 5210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 214.4296875, "completions/mean_terminated_length": 198.1630401611328, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.07236741483211517, "epoch": 0.41688, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.362893031862743e-06, "loss": 0.0, "num_tokens": 239202039.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07152168080210686, "epoch": 0.41696, "grad_norm": 0.0, "learning_rate": 2.3624593608242857e-06, "loss": 0.0, "step": 5212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1640625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 205.671875, "completions/mean_terminated_length": 195.7943878173828, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.07260787487030029, "epoch": 0.41704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3620256469201417e-06, "loss": 0.0, "num_tokens": 239293901.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0716811902821064, "epoch": 0.41712, "grad_norm": 0.0, "learning_rate": 2.3615918901806666e-06, "loss": 0.0, "step": 5214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 225.9921875, "completions/mean_terminated_length": 215.13829040527344, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.07429561391472816, "epoch": 0.4172, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3611580906362193e-06, "loss": 0.0, "num_tokens": 239388364.0, "reward": 0.7624585032463074, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7624585032463074, "rewards/reward_fn/std": 1.297323226928711, "step": 5215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07312090694904327, "epoch": 0.41728, "grad_norm": 0.0, "learning_rate": 2.360724248317161e-06, "loss": 0.0, "step": 5216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.1015625, "completions/mean_terminated_length": 223.37803649902344, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.07461879029870033, "epoch": 0.41736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.360290363253855e-06, "loss": 0.0, "num_tokens": 239483993.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0739203505218029, "epoch": 0.41744, "grad_norm": 0.0, "learning_rate": 2.3598564354766696e-06, "loss": 0.0, "step": 5218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 229.40625, "completions/mean_terminated_length": 211.2105255126953, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.07365912944078445, "epoch": 0.41752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.359422465015975e-06, "loss": 0.0, "num_tokens": 239578893.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07251451909542084, "epoch": 0.4176, "grad_norm": 0.0, "learning_rate": 2.3589884519021446e-06, "loss": 0.0, "step": 5220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 223.40625, "completions/mean_terminated_length": 214.27999877929688, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.07383071631193161, "epoch": 0.41768, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3585543961655546e-06, "loss": 0.0, "num_tokens": 239673025.0, "reward": 0.426705539226532, "reward_std": 0.0, "rewards/reward_fn/mean": 0.426705539226532, "rewards/reward_fn/std": 0.9858514070510864, "step": 5221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06866559758782387, "epoch": 0.41776, "grad_norm": 0.0, "learning_rate": 2.358120297836585e-06, "loss": 0.0, "step": 5222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 213.3046875, "completions/mean_terminated_length": 187.6875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.0745919980108738, "epoch": 0.41784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3576861569456175e-06, "loss": 0.0, "num_tokens": 239765864.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07818600162863731, "epoch": 0.41792, "grad_norm": 0.0, "learning_rate": 2.357251973523037e-06, "loss": 0.0, "step": 5224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 229.21875, "completions/mean_terminated_length": 211.4805145263672, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.07076608017086983, "epoch": 0.418, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.356817747599232e-06, "loss": 0.0, "num_tokens": 239860740.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06944279000163078, "epoch": 0.41808, "grad_norm": 0.0, "learning_rate": 2.3563834792045944e-06, "loss": 0.0, "step": 5226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.1796875, "completions/mean_terminated_length": 215.01112365722656, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.07517309486865997, "epoch": 0.41816, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.355949168369517e-06, "loss": 0.0, "num_tokens": 239955355.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07753702998161316, "epoch": 0.41824, "grad_norm": 0.0, "learning_rate": 2.3555148151243984e-06, "loss": 0.0, "step": 5228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6640625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 249.75, "completions/mean_terminated_length": 237.39535522460938, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.07331069186329842, "epoch": 0.41832, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3550804194996384e-06, "loss": 0.0, "num_tokens": 240052859.0, "reward": 0.5983732342720032, "reward_std": 0.0, "rewards/reward_fn/mean": 0.5983732342720032, "rewards/reward_fn/std": 0.9875431060791016, "step": 5229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07349252700805664, "epoch": 0.4184, "grad_norm": 0.0, "learning_rate": 2.3546459815256395e-06, "loss": 0.0, "step": 5230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 215.546875, "completions/mean_terminated_length": 201.4947509765625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.06125768832862377, "epoch": 0.41848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3542115012328082e-06, "loss": 0.0, "num_tokens": 240145985.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06198981590569019, "epoch": 0.41856, "grad_norm": 0.0, "learning_rate": 2.3537769786515538e-06, "loss": 0.0, "step": 5232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.28125, "completions/mean_terminated_length": 211.82022094726562, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.0838247686624527, "epoch": 0.41864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3533424138122876e-06, "loss": 0.0, "num_tokens": 240240357.0, "reward": 0.009978720918297768, "reward_std": 0.0, "rewards/reward_fn/mean": 0.009978720918297768, "rewards/reward_fn/std": 0.02650495432317257, "step": 5233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08408086374402046, "epoch": 0.41872, "grad_norm": 0.0, "learning_rate": 2.352907806745425e-06, "loss": 0.0, "step": 5234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 243.859375, "completions/mean_terminated_length": 238.34091186523438, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.07485395669937134, "epoch": 0.4188, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.352473157481384e-06, "loss": 0.0, "num_tokens": 240337107.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07465484365820885, "epoch": 0.41888, "grad_norm": 0.0, "learning_rate": 2.3520384660505857e-06, "loss": 0.0, "step": 5236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 250.828125, "completions/mean_terminated_length": 242.75999450683594, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.06642695516347885, "epoch": 0.41896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3516037324834526e-06, "loss": 0.0, "num_tokens": 240434749.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06692181527614594, "epoch": 0.41904, "grad_norm": 0.0, "learning_rate": 2.3511689568104133e-06, "loss": 0.0, "step": 5238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.2578125, "completions/mean_terminated_length": 223.63954162597656, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.07012872025370598, "epoch": 0.41912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3507341390618957e-06, "loss": 0.0, "num_tokens": 240530270.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06728983297944069, "epoch": 0.4192, "grad_norm": 0.0, "learning_rate": 2.3502992792683335e-06, "loss": 0.0, "step": 5240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.6640625, "completions/mean_terminated_length": 238.59210205078125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.06943856179714203, "epoch": 0.41928, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.349864377460162e-06, "loss": 0.0, "num_tokens": 240627251.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0665985606610775, "epoch": 0.41936, "grad_norm": 0.0, "learning_rate": 2.34942943366782e-06, "loss": 0.0, "step": 5242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.7109375, "completions/mean_terminated_length": 232.84811401367188, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.06744945794343948, "epoch": 0.41944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3489944479217495e-06, "loss": 0.0, "num_tokens": 240723726.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06593504175543785, "epoch": 0.41952, "grad_norm": 0.0, "learning_rate": 2.3485594202523942e-06, "loss": 0.0, "step": 5244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.3359375, "completions/mean_terminated_length": 213.33734130859375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06507443636655807, "epoch": 0.4196, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.348124350690201e-06, "loss": 0.0, "num_tokens": 240818489.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06315275654196739, "epoch": 0.41968, "grad_norm": 0.0, "learning_rate": 2.3476892392656202e-06, "loss": 0.0, "step": 5246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.5625, "completions/mean_terminated_length": 235.75758361816406, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.06862398236989975, "epoch": 0.41976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3472540860091057e-06, "loss": 0.0, "num_tokens": 240915457.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07046732679009438, "epoch": 0.41984, "grad_norm": 0.0, "learning_rate": 2.3468188909511132e-06, "loss": 0.0, "step": 5248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.140625, "completions/mean_terminated_length": 209.2058868408203, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.08489293977618217, "epoch": 0.41992, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.346383654122103e-06, "loss": 0.0, "num_tokens": 241010579.0, "reward": 0.38992840051651, "reward_std": 0.0, "rewards/reward_fn/mean": 0.38992840051651, "rewards/reward_fn/std": 0.9911679029464722, "step": 5249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08039823919534683, "epoch": 0.42, "grad_norm": 0.0, "learning_rate": 2.345948375552535e-06, "loss": 0.0, "step": 5250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.5703125, "completions/mean_terminated_length": 241.09877014160156, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.0694403164088726, "epoch": 0.42008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3455130552728753e-06, "loss": 0.0, "num_tokens": 241107676.0, "reward": 0.01492841262370348, "reward_std": 0.0, "rewards/reward_fn/mean": 0.01492841262370348, "rewards/reward_fn/std": 0.039652060717344284, "step": 5251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06821493059396744, "epoch": 0.42016, "grad_norm": 0.0, "learning_rate": 2.3450776933135916e-06, "loss": 0.0, "step": 5252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.7109375, "completions/mean_terminated_length": 219.23158264160156, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.0635683611035347, "epoch": 0.42024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.344642289705155e-06, "loss": 0.0, "num_tokens": 241202487.0, "reward": 0.7549973130226135, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7549973130226135, "rewards/reward_fn/std": 1.3013103008270264, "step": 5253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06017874553799629, "epoch": 0.42032, "grad_norm": 0.0, "learning_rate": 2.344206844478038e-06, "loss": 0.0, "step": 5254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.6796875, "completions/mean_terminated_length": 223.23158264160156, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.07119014486670494, "epoch": 0.4204, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3437713576627182e-06, "loss": 0.0, "num_tokens": 241297678.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.073277048766613, "epoch": 0.42048, "grad_norm": 0.0, "learning_rate": 2.343335829289675e-06, "loss": 0.0, "step": 5256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.2265625, "completions/mean_terminated_length": 223.3366241455078, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.07297490909695625, "epoch": 0.42056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.34290025938939e-06, "loss": 0.0, "num_tokens": 241392683.0, "reward": 1.540934681892395, "reward_std": 0.0, "rewards/reward_fn/mean": 1.540934681892395, "rewards/reward_fn/std": 1.4682531356811523, "step": 5257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07517783716320992, "epoch": 0.42064, "grad_norm": 0.0, "learning_rate": 2.3424646479923497e-06, "loss": 0.0, "step": 5258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.9453125, "completions/mean_terminated_length": 238.52542114257812, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.06924201175570488, "epoch": 0.42072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.342028995129041e-06, "loss": 0.0, "num_tokens": 241489956.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07016229256987572, "epoch": 0.4208, "grad_norm": 0.0, "learning_rate": 2.341593300829956e-06, "loss": 0.0, "step": 5260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 218.5546875, "completions/mean_terminated_length": 211.62037658691406, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.06937442719936371, "epoch": 0.42088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.341157565125588e-06, "loss": 0.0, "num_tokens": 241583467.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07143720984458923, "epoch": 0.42096, "grad_norm": 0.0, "learning_rate": 2.3407217880464353e-06, "loss": 0.0, "step": 5262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.8984375, "completions/mean_terminated_length": 214.39706420898438, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.06026245094835758, "epoch": 0.42104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3402859696229962e-06, "loss": 0.0, "num_tokens": 241678942.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.061368269845843315, "epoch": 0.42112, "grad_norm": 0.0, "learning_rate": 2.3398501098857733e-06, "loss": 0.0, "step": 5264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 213.34375, "completions/mean_terminated_length": 199.71133422851562, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.07371180132031441, "epoch": 0.4212, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3394142088652732e-06, "loss": 0.0, "num_tokens": 241771786.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07389563322067261, "epoch": 0.42128, "grad_norm": 0.0, "learning_rate": 2.338978266592003e-06, "loss": 0.0, "step": 5266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.734375, "completions/mean_terminated_length": 222.60000610351562, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.07528432458639145, "epoch": 0.42136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3385422830964753e-06, "loss": 0.0, "num_tokens": 241867752.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07478532940149307, "epoch": 0.42144, "grad_norm": 0.0, "learning_rate": 2.3381062584092047e-06, "loss": 0.0, "step": 5268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 195.7265625, "completions/mean_terminated_length": 181.0970916748047, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.07725677639245987, "epoch": 0.42152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3376701925607074e-06, "loss": 0.0, "num_tokens": 241958341.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 5269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07193390280008316, "epoch": 0.4216, "grad_norm": 0.0, "learning_rate": 2.337234085581503e-06, "loss": 0.0, "step": 5270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 233.4921875, "completions/mean_terminated_length": 228.29808044433594, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.061741506680846214, "epoch": 0.42168, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.336797937502115e-06, "loss": 0.0, "num_tokens": 242053764.0, "reward": 0.022260108962655067, "reward_std": 0.0, "rewards/reward_fn/mean": 0.022260108962655067, "rewards/reward_fn/std": 0.059126123785972595, "step": 5271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06672011315822601, "epoch": 0.42176, "grad_norm": 0.0, "learning_rate": 2.3363617483530695e-06, "loss": 0.0, "step": 5272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.3828125, "completions/mean_terminated_length": 228.10000610351562, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.06600555777549744, "epoch": 0.42184, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.335925518164895e-06, "loss": 0.0, "num_tokens": 242149557.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06388440355658531, "epoch": 0.42192, "grad_norm": 0.0, "learning_rate": 2.3354892469681223e-06, "loss": 0.0, "step": 5274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 211.4140625, "completions/mean_terminated_length": 201.6476287841797, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 0.07175568491220474, "epoch": 0.422, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.335052934793286e-06, "loss": 0.0, "num_tokens": 242242154.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07244822382926941, "epoch": 0.42208, "grad_norm": 0.0, "learning_rate": 2.3346165816709246e-06, "loss": 0.0, "step": 5276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.3046875, "completions/mean_terminated_length": 222.23809814453125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.07090487331151962, "epoch": 0.42216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3341801876315764e-06, "loss": 0.0, "num_tokens": 242336913.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06948929652571678, "epoch": 0.42224, "grad_norm": 0.0, "learning_rate": 2.3337437527057856e-06, "loss": 0.0, "step": 5278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 209.2421875, "completions/mean_terminated_length": 190.23077392578125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.06922392547130585, "epoch": 0.42232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3333072769240967e-06, "loss": 0.0, "num_tokens": 242429232.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 5279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06842698901891708, "epoch": 0.4224, "grad_norm": 0.0, "learning_rate": 2.3328707603170605e-06, "loss": 0.0, "step": 5280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.359375, "completions/mean_terminated_length": 234.7105255126953, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.06779127195477486, "epoch": 0.42248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.332434202915227e-06, "loss": 0.0, "num_tokens": 242525918.0, "reward": 0.30578044056892395, "reward_std": 0.0, "rewards/reward_fn/mean": 0.30578044056892395, "rewards/reward_fn/std": 0.6593836545944214, "step": 5281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06551835313439369, "epoch": 0.42256, "grad_norm": 0.0, "learning_rate": 2.3319976047491503e-06, "loss": 0.0, "step": 5282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1640625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 218.1953125, "completions/mean_terminated_length": 210.77569580078125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.06660113856196404, "epoch": 0.42264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3315609658493894e-06, "loss": 0.0, "num_tokens": 242619383.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06825561821460724, "epoch": 0.42272, "grad_norm": 0.0, "learning_rate": 2.331124286246502e-06, "loss": 0.0, "step": 5284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.25, "completions/mean_terminated_length": 223.0724639892578, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.06387179717421532, "epoch": 0.4228, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3306875659710535e-06, "loss": 0.0, "num_tokens": 242715415.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06753767281770706, "epoch": 0.42288, "grad_norm": 0.0, "learning_rate": 2.3302508050536084e-06, "loss": 0.0, "step": 5286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 234.8515625, "completions/mean_terminated_length": 231.8303680419922, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.07106943055987358, "epoch": 0.42296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.329814003524735e-06, "loss": 0.0, "num_tokens": 242811012.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07091990485787392, "epoch": 0.42304, "grad_norm": 0.0, "learning_rate": 2.329377161415006e-06, "loss": 0.0, "step": 5288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.046875, "completions/mean_terminated_length": 223.07215881347656, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.07013865932822227, "epoch": 0.42312, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3289402787549944e-06, "loss": 0.0, "num_tokens": 242906122.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07161591574549675, "epoch": 0.4232, "grad_norm": 0.0, "learning_rate": 2.3285033555752788e-06, "loss": 0.0, "step": 5290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.734375, "completions/mean_terminated_length": 208.39474487304688, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.06587136536836624, "epoch": 0.42328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3280663919064383e-06, "loss": 0.0, "num_tokens": 243000808.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06895967200398445, "epoch": 0.42336, "grad_norm": 0.0, "learning_rate": 2.3276293877790555e-06, "loss": 0.0, "step": 5292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 235.0390625, "completions/mean_terminated_length": 225.8539276123047, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.06661925464868546, "epoch": 0.42344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.327192343223717e-06, "loss": 0.0, "num_tokens": 243096429.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0648123100399971, "epoch": 0.42352, "grad_norm": 0.0, "learning_rate": 2.3267552582710103e-06, "loss": 0.0, "step": 5294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.3828125, "completions/mean_terminated_length": 224.43751525878906, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.07488948106765747, "epoch": 0.4236, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.326318132951527e-06, "loss": 0.0, "num_tokens": 243191198.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08391440659761429, "epoch": 0.42368, "grad_norm": 0.0, "learning_rate": 2.3258809672958624e-06, "loss": 0.0, "step": 5296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.59375, "completions/mean_terminated_length": 228.61111450195312, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.06315464340150356, "epoch": 0.42376, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.325443761334612e-06, "loss": 0.0, "num_tokens": 243287530.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06309897638857365, "epoch": 0.42384, "grad_norm": 0.0, "learning_rate": 2.325006515098376e-06, "loss": 0.0, "step": 5298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.7109375, "completions/mean_terminated_length": 220.9479217529297, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.0654376819729805, "epoch": 0.42392, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3245692286177576e-06, "loss": 0.0, "num_tokens": 243382469.0, "reward": 0.8039442300796509, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8039442300796509, "rewards/reward_fn/std": 1.276266098022461, "step": 5299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06623052060604095, "epoch": 0.424, "grad_norm": 0.0, "learning_rate": 2.3241319019233615e-06, "loss": 0.0, "step": 5300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.171875, "completions/mean_terminated_length": 215.33334350585938, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.06810420751571655, "epoch": 0.42408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3236945350457957e-06, "loss": 0.0, "num_tokens": 243477723.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07296685501933098, "epoch": 0.42416, "grad_norm": 0.0, "learning_rate": 2.3232571280156724e-06, "loss": 0.0, "step": 5302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.625, "completions/mean_terminated_length": 239.04949951171875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.06975899636745453, "epoch": 0.42424, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.322819680863605e-06, "loss": 0.0, "num_tokens": 243574315.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06932535767555237, "epoch": 0.42432, "grad_norm": 0.0, "learning_rate": 2.3223821936202103e-06, "loss": 0.0, "step": 5304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.2578125, "completions/mean_terminated_length": 237.46591186523438, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.07818233221769333, "epoch": 0.4244, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.321944666316107e-06, "loss": 0.0, "num_tokens": 243670988.0, "reward": 0.04315175488591194, "reward_std": 0.0, "rewards/reward_fn/mean": 0.04315175488591194, "rewards/reward_fn/std": 0.11461740732192993, "step": 5305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0744171068072319, "epoch": 0.42448, "grad_norm": 0.0, "learning_rate": 2.3215070989819177e-06, "loss": 0.0, "step": 5306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.2265625, "completions/mean_terminated_length": 217.65151977539062, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.07398400828242302, "epoch": 0.42456, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.321069491648268e-06, "loss": 0.0, "num_tokens": 243766761.0, "reward": 0.4806819558143616, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4806819558143616, "rewards/reward_fn/std": 0.9955083131790161, "step": 5307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0706057921051979, "epoch": 0.42464, "grad_norm": 0.0, "learning_rate": 2.3206318443457857e-06, "loss": 0.0, "step": 5308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.4453125, "completions/mean_terminated_length": 221.6724090576172, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.0733211375772953, "epoch": 0.42472, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3201941571051012e-06, "loss": 0.0, "num_tokens": 243863074.0, "reward": 0.004997334908694029, "reward_std": 0.0, "rewards/reward_fn/mean": 0.004997334908694029, "rewards/reward_fn/std": 0.013273656368255615, "step": 5309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06874364987015724, "epoch": 0.4248, "grad_norm": 0.0, "learning_rate": 2.3197564299568487e-06, "loss": 0.0, "step": 5310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 238.0, "completions/mean_terminated_length": 230.11236572265625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.06923165172338486, "epoch": 0.42488, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.319318662931663e-06, "loss": 0.0, "num_tokens": 243959074.0, "reward": 0.37749966979026794, "reward_std": 0.0, "rewards/reward_fn/mean": 0.37749966979026794, "rewards/reward_fn/std": 0.9951284527778625, "step": 5311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06587618216872215, "epoch": 0.42496, "grad_norm": 0.0, "learning_rate": 2.318880856060184e-06, "loss": 0.0, "step": 5312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 229.2578125, "completions/mean_terminated_length": 207.78872680664062, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.0792437344789505, "epoch": 0.42504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.318443009373054e-06, "loss": 0.0, "num_tokens": 244053955.0, "reward": 0.12208539247512817, "reward_std": 0.0, "rewards/reward_fn/mean": 0.12208539247512817, "rewards/reward_fn/std": 0.32427677512168884, "step": 5313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07394345849752426, "epoch": 0.42512, "grad_norm": 0.0, "learning_rate": 2.3180051229009168e-06, "loss": 0.0, "step": 5314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 219.1015625, "completions/mean_terminated_length": 208.76998901367188, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.06326854787766933, "epoch": 0.4252, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3175671966744213e-06, "loss": 0.0, "num_tokens": 244147536.0, "reward": 0.11835075169801712, "reward_std": 0.0, "rewards/reward_fn/mean": 0.11835075169801712, "rewards/reward_fn/std": 0.31435704231262207, "step": 5315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06219361908733845, "epoch": 0.42528, "grad_norm": 0.0, "learning_rate": 2.317129230724216e-06, "loss": 0.0, "step": 5316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.4609375, "completions/mean_terminated_length": 224.2967071533203, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.07381409406661987, "epoch": 0.42536, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3166912250809548e-06, "loss": 0.0, "num_tokens": 244242955.0, "reward": 0.37749966979026794, "reward_std": 0.0, "rewards/reward_fn/mean": 0.37749966979026794, "rewards/reward_fn/std": 0.9951284527778625, "step": 5317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07350113615393639, "epoch": 0.42544, "grad_norm": 0.0, "learning_rate": 2.3162531797752925e-06, "loss": 0.0, "step": 5318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 231.4921875, "completions/mean_terminated_length": 216.2911376953125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.0686182901263237, "epoch": 0.42552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.315815094837889e-06, "loss": 0.0, "num_tokens": 244338122.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0718287006020546, "epoch": 0.4256, "grad_norm": 0.0, "learning_rate": 2.3153769702994045e-06, "loss": 0.0, "step": 5320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 244.0625, "completions/mean_terminated_length": 233.19403076171875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.06489988788962364, "epoch": 0.42568, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.314938806190504e-06, "loss": 0.0, "num_tokens": 244434898.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0652560368180275, "epoch": 0.42576, "grad_norm": 0.0, "learning_rate": 2.314500602541854e-06, "loss": 0.0, "step": 5322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.125, "completions/mean_terminated_length": 215.7894744873047, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.05682010389864445, "epoch": 0.42584, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3140623593841243e-06, "loss": 0.0, "num_tokens": 244530146.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.057065270841121674, "epoch": 0.42592, "grad_norm": 0.0, "learning_rate": 2.3136240767479867e-06, "loss": 0.0, "step": 5324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 232.546875, "completions/mean_terminated_length": 205.9666748046875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.06421176716685295, "epoch": 0.426, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.313185754664117e-06, "loss": 0.0, "num_tokens": 244625448.0, "reward": 0.055780451744794846, "reward_std": 0.0, "rewards/reward_fn/mean": 0.055780451744794846, "rewards/reward_fn/std": 0.14816109836101532, "step": 5325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06870673224329948, "epoch": 0.42608, "grad_norm": 0.0, "learning_rate": 2.312747393163192e-06, "loss": 0.0, "step": 5326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.78125, "completions/mean_terminated_length": 208.3913116455078, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.06351395137608051, "epoch": 0.42616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3123089922758938e-06, "loss": 0.0, "num_tokens": 244719372.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06499065831303596, "epoch": 0.42624, "grad_norm": 0.0, "learning_rate": 2.3118705520329053e-06, "loss": 0.0, "step": 5328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.5078125, "completions/mean_terminated_length": 228.58441162109375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.06794979050755501, "epoch": 0.42632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.311432072464913e-06, "loss": 0.0, "num_tokens": 244815565.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07097979635000229, "epoch": 0.4264, "grad_norm": 0.0, "learning_rate": 2.3109935536026053e-06, "loss": 0.0, "step": 5330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.1484375, "completions/mean_terminated_length": 238.83334350585938, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.07062463834881783, "epoch": 0.42648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.310554995476673e-06, "loss": 0.0, "num_tokens": 244912736.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07010534778237343, "epoch": 0.42656, "grad_norm": 0.0, "learning_rate": 2.3101163981178126e-06, "loss": 0.0, "step": 5332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 176.546875, "completions/mean_terminated_length": 157.2621307373047, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.07740472257137299, "epoch": 0.42664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3096777615567202e-06, "loss": 0.0, "num_tokens": 245000870.0, "reward": 1.2268017530441284, "reward_std": 0.0, "rewards/reward_fn/mean": 1.2268017530441284, "rewards/reward_fn/std": 1.4029408693313599, "step": 5333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07879650965332985, "epoch": 0.42672, "grad_norm": 0.0, "learning_rate": 2.3092390858240958e-06, "loss": 0.0, "step": 5334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.03125, "completions/mean_terminated_length": 232.0500030517578, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.06626297906041145, "epoch": 0.4268, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.308800370950642e-06, "loss": 0.0, "num_tokens": 245097258.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0637221522629261, "epoch": 0.42688, "grad_norm": 0.0, "learning_rate": 2.308361616967064e-06, "loss": 0.0, "step": 5336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 250.3984375, "completions/mean_terminated_length": 241.0625, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.06477456539869308, "epoch": 0.42696, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3079228239040707e-06, "loss": 0.0, "num_tokens": 245194845.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06837230548262596, "epoch": 0.42704, "grad_norm": 0.0, "learning_rate": 2.307483991792372e-06, "loss": 0.0, "step": 5338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 234.0, "completions/mean_terminated_length": 216.88888549804688, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.06777708977460861, "epoch": 0.42712, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3070451206626825e-06, "loss": 0.0, "num_tokens": 245290333.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06592106074094772, "epoch": 0.4272, "grad_norm": 0.0, "learning_rate": 2.306606210545718e-06, "loss": 0.0, "step": 5340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.734375, "completions/mean_terminated_length": 226.73118591308594, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.07057582587003708, "epoch": 0.42728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3061672614721973e-06, "loss": 0.0, "num_tokens": 245385915.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07209181785583496, "epoch": 0.42736, "grad_norm": 0.0, "learning_rate": 2.3057282734728428e-06, "loss": 0.0, "step": 5342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.109375, "completions/mean_terminated_length": 245.02174377441406, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.06636783480644226, "epoch": 0.42744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3052892465783786e-06, "loss": 0.0, "num_tokens": 245483209.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06462526693940163, "epoch": 0.42752, "grad_norm": 0.0, "learning_rate": 2.3048501808195326e-06, "loss": 0.0, "step": 5344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.8671875, "completions/mean_terminated_length": 209.02149963378906, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.062229517847299576, "epoch": 0.4276, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3044110762270335e-06, "loss": 0.0, "num_tokens": 245577144.0, "reward": 1.2276300191879272, "reward_std": 0.0, "rewards/reward_fn/mean": 1.2276300191879272, "rewards/reward_fn/std": 1.402697205543518, "step": 5345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06423754245042801, "epoch": 0.42768, "grad_norm": 0.0, "learning_rate": 2.3039719328316154e-06, "loss": 0.0, "step": 5346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.1953125, "completions/mean_terminated_length": 212.0128173828125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.08068554103374481, "epoch": 0.42776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.303532750664013e-06, "loss": 0.0, "num_tokens": 245672017.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0729227215051651, "epoch": 0.42784, "grad_norm": 0.0, "learning_rate": 2.303093529754964e-06, "loss": 0.0, "step": 5348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.59375, "completions/mean_terminated_length": 221.4042510986328, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.07707605510950089, "epoch": 0.42792, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3026542701352097e-06, "loss": 0.0, "num_tokens": 245767069.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07690716534852982, "epoch": 0.428, "grad_norm": 0.0, "learning_rate": 2.302214971835494e-06, "loss": 0.0, "step": 5350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 223.046875, "completions/mean_terminated_length": 205.1807098388672, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.07286088541150093, "epoch": 0.42808, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3017756348865628e-06, "loss": 0.0, "num_tokens": 245861155.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06787936761975288, "epoch": 0.42816, "grad_norm": 0.0, "learning_rate": 2.301336259319165e-06, "loss": 0.0, "step": 5352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.03125, "completions/mean_terminated_length": 199.1555633544922, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.08179143071174622, "epoch": 0.42824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.300896845164052e-06, "loss": 0.0, "num_tokens": 245954343.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08695309236645699, "epoch": 0.42832, "grad_norm": 0.0, "learning_rate": 2.3004573924519783e-06, "loss": 0.0, "step": 5354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5546875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 249.0625, "completions/mean_terminated_length": 240.42105102539062, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.07684562355279922, "epoch": 0.4284, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3000179012137014e-06, "loss": 0.0, "num_tokens": 246051759.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0794275589287281, "epoch": 0.42848, "grad_norm": 0.0, "learning_rate": 2.29957837147998e-06, "loss": 0.0, "step": 5356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1640625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 215.40625, "completions/mean_terminated_length": 207.43923950195312, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.0746622309088707, "epoch": 0.42856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.299138803281578e-06, "loss": 0.0, "num_tokens": 246144867.0, "reward": 0.41815176606178284, "reward_std": 0.0, "rewards/reward_fn/mean": 0.41815176606178284, "rewards/reward_fn/std": 0.9862273335456848, "step": 5357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06815417483448982, "epoch": 0.42864, "grad_norm": 0.0, "learning_rate": 2.29869919664926e-06, "loss": 0.0, "step": 5358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.0390625, "completions/mean_terminated_length": 231.32955932617188, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.07005944103002548, "epoch": 0.42872, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.298259551613793e-06, "loss": 0.0, "num_tokens": 246241000.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0685114674270153, "epoch": 0.4288, "grad_norm": 0.0, "learning_rate": 2.297819868205948e-06, "loss": 0.0, "step": 5360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.328125, "completions/mean_terminated_length": 209.55882263183594, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.06612354144454002, "epoch": 0.42888, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2973801464564987e-06, "loss": 0.0, "num_tokens": 246336146.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06677861511707306, "epoch": 0.42896, "grad_norm": 0.0, "learning_rate": 2.29694038639622e-06, "loss": 0.0, "step": 5362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.1171875, "completions/mean_terminated_length": 233.2526397705078, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.06725037097930908, "epoch": 0.42904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.296500588055892e-06, "loss": 0.0, "num_tokens": 246432289.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06976617127656937, "epoch": 0.42912, "grad_norm": 0.0, "learning_rate": 2.2960607514662946e-06, "loss": 0.0, "step": 5364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.875, "completions/mean_terminated_length": 233.75, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.07201723009347916, "epoch": 0.4292, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.295620876658212e-06, "loss": 0.0, "num_tokens": 246529169.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07327869534492493, "epoch": 0.42928, "grad_norm": 0.0, "learning_rate": 2.2951809636624303e-06, "loss": 0.0, "step": 5366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.3828125, "completions/mean_terminated_length": 235.73255920410156, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.06163582392036915, "epoch": 0.42936, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.29474101250974e-06, "loss": 0.0, "num_tokens": 246625730.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0645727664232254, "epoch": 0.42944, "grad_norm": 0.0, "learning_rate": 2.294301023230932e-06, "loss": 0.0, "step": 5368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 212.828125, "completions/mean_terminated_length": 200.739990234375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.05887946859002113, "epoch": 0.42952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.293860995856802e-06, "loss": 0.0, "num_tokens": 246718508.0, "reward": 0.46690279245376587, "reward_std": 0.0, "rewards/reward_fn/mean": 0.46690279245376587, "rewards/reward_fn/std": 0.9910825490951538, "step": 5369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06348223239183426, "epoch": 0.4296, "grad_norm": 0.0, "learning_rate": 2.2934209304181457e-06, "loss": 0.0, "step": 5370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.84375, "completions/mean_terminated_length": 222.8514862060547, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.07070646435022354, "epoch": 0.42968, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.292980826945765e-06, "loss": 0.0, "num_tokens": 246813464.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06694794073700905, "epoch": 0.42976, "grad_norm": 0.0, "learning_rate": 2.2925406854704596e-06, "loss": 0.0, "step": 5372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.0625, "completions/mean_terminated_length": 233.04916381835938, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.06684572994709015, "epoch": 0.42984, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2921005060230376e-06, "loss": 0.0, "num_tokens": 246910368.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06536418944597244, "epoch": 0.42992, "grad_norm": 0.0, "learning_rate": 2.291660288634306e-06, "loss": 0.0, "step": 5374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.4375, "completions/mean_terminated_length": 219.44444274902344, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.0747949555516243, "epoch": 0.43, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.291220033335075e-06, "loss": 0.0, "num_tokens": 247006040.0, "reward": 0.4224936366081238, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4224936366081238, "rewards/reward_fn/std": 0.9859711527824402, "step": 5375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06768450886011124, "epoch": 0.43008, "grad_norm": 0.0, "learning_rate": 2.290779740156158e-06, "loss": 0.0, "step": 5376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 231.9453125, "completions/mean_terminated_length": 219.34524536132812, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.06549450010061264, "epoch": 0.43016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2903394091283713e-06, "loss": 0.0, "num_tokens": 247101265.0, "reward": 0.45143958926200867, "reward_std": 0.0, "rewards/reward_fn/mean": 0.45143958926200867, "rewards/reward_fn/std": 0.9861985445022583, "step": 5377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06621356680989265, "epoch": 0.43024, "grad_norm": 0.0, "learning_rate": 2.2898990402825327e-06, "loss": 0.0, "step": 5378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 243.3828125, "completions/mean_terminated_length": 234.17567443847656, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.07154519855976105, "epoch": 0.43032, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2894586336494634e-06, "loss": 0.0, "num_tokens": 247197954.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0714956596493721, "epoch": 0.4304, "grad_norm": 0.0, "learning_rate": 2.2890181892599873e-06, "loss": 0.0, "step": 5380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 214.4765625, "completions/mean_terminated_length": 194.1976776123047, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.0727175772190094, "epoch": 0.43048, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2885777071449317e-06, "loss": 0.0, "num_tokens": 247290943.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 5381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06969426572322845, "epoch": 0.43056, "grad_norm": 0.0, "learning_rate": 2.288137187335125e-06, "loss": 0.0, "step": 5382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.2578125, "completions/mean_terminated_length": 235.8870849609375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.07088151946663857, "epoch": 0.43064, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2876966298613987e-06, "loss": 0.0, "num_tokens": 247388000.0, "reward": 0.06162349507212639, "reward_std": 0.0, "rewards/reward_fn/mean": 0.06162349507212639, "rewards/reward_fn/std": 0.16368108987808228, "step": 5383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06830273196101189, "epoch": 0.43072, "grad_norm": 0.0, "learning_rate": 2.287256034754588e-06, "loss": 0.0, "step": 5384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.4765625, "completions/mean_terminated_length": 235.27691650390625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.07212593778967857, "epoch": 0.4308, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2868154020455285e-06, "loss": 0.0, "num_tokens": 247484957.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07390762120485306, "epoch": 0.43088, "grad_norm": 0.0, "learning_rate": 2.286374731765061e-06, "loss": 0.0, "step": 5386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.78125, "completions/mean_terminated_length": 208.05128479003906, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.08061892166733742, "epoch": 0.43096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.285934023944028e-06, "loss": 0.0, "num_tokens": 247579521.0, "reward": 0.1690801978111267, "reward_std": 0.0, "rewards/reward_fn/mean": 0.1690801978111267, "rewards/reward_fn/std": 0.3295021057128906, "step": 5387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08267820626497269, "epoch": 0.43104, "grad_norm": 0.0, "learning_rate": 2.2854932786132734e-06, "loss": 0.0, "step": 5388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.984375, "completions/mean_terminated_length": 209.56044006347656, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.06371119432151318, "epoch": 0.43112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2850524958036455e-06, "loss": 0.0, "num_tokens": 247673599.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06632687151432037, "epoch": 0.4312, "grad_norm": 0.0, "learning_rate": 2.284611675545994e-06, "loss": 0.0, "step": 5390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.515625, "completions/mean_terminated_length": 207.86666870117188, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.06982553750276566, "epoch": 0.43128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2841708178711713e-06, "loss": 0.0, "num_tokens": 247766849.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07169649749994278, "epoch": 0.43136, "grad_norm": 0.0, "learning_rate": 2.2837299228100334e-06, "loss": 0.0, "step": 5392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.1640625, "completions/mean_terminated_length": 241.98989868164062, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "entropy": 0.07075995951890945, "epoch": 0.43144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2832889903934382e-06, "loss": 0.0, "num_tokens": 247863766.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06586574018001556, "epoch": 0.43152, "grad_norm": 0.0, "learning_rate": 2.2828480206522472e-06, "loss": 0.0, "step": 5394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 241.6953125, "completions/mean_terminated_length": 235.6555633544922, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.06461135670542717, "epoch": 0.4316, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2824070136173217e-06, "loss": 0.0, "num_tokens": 247960239.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06455898657441139, "epoch": 0.43168, "grad_norm": 0.0, "learning_rate": 2.2819659693195284e-06, "loss": 0.0, "step": 5396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.25, "completions/mean_terminated_length": 234.795166015625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.07223686948418617, "epoch": 0.43176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.281524887789737e-06, "loss": 0.0, "num_tokens": 248056783.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07340234518051147, "epoch": 0.43184, "grad_norm": 0.0, "learning_rate": 2.2810837690588162e-06, "loss": 0.0, "step": 5398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.5703125, "completions/mean_terminated_length": 237.19717407226562, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.06519630551338196, "epoch": 0.43192, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.280642613157641e-06, "loss": 0.0, "num_tokens": 248153752.0, "reward": 0.03411313518881798, "reward_std": 0.0, "rewards/reward_fn/mean": 0.03411313518881798, "rewards/reward_fn/std": 0.09060950577259064, "step": 5399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06548605859279633, "epoch": 0.432, "grad_norm": 0.0, "learning_rate": 2.2802014201170886e-06, "loss": 0.0, "step": 5400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1171875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 214.1875, "completions/mean_terminated_length": 208.6371612548828, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.06703458353877068, "epoch": 0.43208, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2797601899680355e-06, "loss": 0.0, "num_tokens": 248246704.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06716670095920563, "epoch": 0.43216, "grad_norm": 0.0, "learning_rate": 2.279318922741365e-06, "loss": 0.0, "step": 5402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 249.6953125, "completions/mean_terminated_length": 243.95521545410156, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.06509961187839508, "epoch": 0.43224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2788776184679605e-06, "loss": 0.0, "num_tokens": 248344201.0, "reward": 0.04961630329489708, "reward_std": 0.0, "rewards/reward_fn/mean": 0.04961630329489708, "rewards/reward_fn/std": 0.1317882090806961, "step": 5403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0663159191608429, "epoch": 0.43232, "grad_norm": 0.0, "learning_rate": 2.278436277178709e-06, "loss": 0.0, "step": 5404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.1015625, "completions/mean_terminated_length": 238.5689697265625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.07580886781215668, "epoch": 0.4324, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.277994898904499e-06, "loss": 0.0, "num_tokens": 248441494.0, "reward": 0.06533318012952805, "reward_std": 0.0, "rewards/reward_fn/mean": 0.06533318012952805, "rewards/reward_fn/std": 0.1735345423221588, "step": 5405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07131427526473999, "epoch": 0.43248, "grad_norm": 0.0, "learning_rate": 2.2775534836762228e-06, "loss": 0.0, "step": 5406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 220.1484375, "completions/mean_terminated_length": 199.34568786621094, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.07009386271238327, "epoch": 0.43256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.277112031524775e-06, "loss": 0.0, "num_tokens": 248535209.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06923504173755646, "epoch": 0.43264, "grad_norm": 0.0, "learning_rate": 2.2766705424810523e-06, "loss": 0.0, "step": 5408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 241.4453125, "completions/mean_terminated_length": 235.3000030517578, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.06878890097141266, "epoch": 0.43272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.276229016575954e-06, "loss": 0.0, "num_tokens": 248631650.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06849773600697517, "epoch": 0.4328, "grad_norm": 0.0, "learning_rate": 2.275787453840383e-06, "loss": 0.0, "step": 5410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 214.40625, "completions/mean_terminated_length": 202.75999450683594, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.07067575678229332, "epoch": 0.43288, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2753458543052433e-06, "loss": 0.0, "num_tokens": 248724630.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06882335245609283, "epoch": 0.43296, "grad_norm": 0.0, "learning_rate": 2.274904218001443e-06, "loss": 0.0, "step": 5412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.5, "completions/mean_terminated_length": 210.9629669189453, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.06803910061717033, "epoch": 0.43304, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.274462544959891e-06, "loss": 0.0, "num_tokens": 248819286.0, "reward": 0.4091131389141083, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4091131389141083, "rewards/reward_fn/std": 0.9871928691864014, "step": 5413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07191339880228043, "epoch": 0.43312, "grad_norm": 0.0, "learning_rate": 2.2740208352115004e-06, "loss": 0.0, "step": 5414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.3203125, "completions/mean_terminated_length": 233.83544921875, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.06917822360992432, "epoch": 0.4332, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2735790887871855e-06, "loss": 0.0, "num_tokens": 248915839.0, "reward": 0.4965865910053253, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4965865910053253, "rewards/reward_fn/std": 1.0022554397583008, "step": 5415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07226638868451118, "epoch": 0.43328, "grad_norm": 0.0, "learning_rate": 2.2731373057178647e-06, "loss": 0.0, "step": 5416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 213.2421875, "completions/mean_terminated_length": 193.09194946289062, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.0709972158074379, "epoch": 0.43336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2726954860344588e-06, "loss": 0.0, "num_tokens": 249008670.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0724700428545475, "epoch": 0.43344, "grad_norm": 0.0, "learning_rate": 2.272253629767889e-06, "loss": 0.0, "step": 5418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 235.6640625, "completions/mean_terminated_length": 232.1192626953125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.06588584557175636, "epoch": 0.43352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2718117369490807e-06, "loss": 0.0, "num_tokens": 249104371.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06505630165338516, "epoch": 0.4336, "grad_norm": 0.0, "learning_rate": 2.2713698076089623e-06, "loss": 0.0, "step": 5420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 216.484375, "completions/mean_terminated_length": 195.0602264404297, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.0692085288465023, "epoch": 0.43368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.270927841778464e-06, "loss": 0.0, "num_tokens": 249197617.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06950974836945534, "epoch": 0.43376, "grad_norm": 0.0, "learning_rate": 2.270485839488519e-06, "loss": 0.0, "step": 5422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.6015625, "completions/mean_terminated_length": 229.53932189941406, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.07182330638170242, "epoch": 0.43384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2700438007700634e-06, "loss": 0.0, "num_tokens": 249293566.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0676824152469635, "epoch": 0.43392, "grad_norm": 0.0, "learning_rate": 2.2696017256540336e-06, "loss": 0.0, "step": 5424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 241.9296875, "completions/mean_terminated_length": 237.4329833984375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.07043461129069328, "epoch": 0.434, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2691596141713713e-06, "loss": 0.0, "num_tokens": 249390069.0, "reward": 0.39967191219329834, "reward_std": 0.0, "rewards/reward_fn/mean": 0.39967191219329834, "rewards/reward_fn/std": 0.988822877407074, "step": 5425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06561531126499176, "epoch": 0.43408, "grad_norm": 0.0, "learning_rate": 2.268717466353019e-06, "loss": 0.0, "step": 5426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.3125, "completions/mean_terminated_length": 238.4615478515625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.06534554064273834, "epoch": 0.43416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2682752822299227e-06, "loss": 0.0, "num_tokens": 249487005.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06449158862233162, "epoch": 0.43424, "grad_norm": 0.0, "learning_rate": 2.2678330618330307e-06, "loss": 0.0, "step": 5428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.5078125, "completions/mean_terminated_length": 221.48684692382812, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.06790326163172722, "epoch": 0.43432, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2673908051932943e-06, "loss": 0.0, "num_tokens": 249582686.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0709352158010006, "epoch": 0.4344, "grad_norm": 0.0, "learning_rate": 2.266948512341666e-06, "loss": 0.0, "step": 5430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.2890625, "completions/mean_terminated_length": 227.1445770263672, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.06691017001867294, "epoch": 0.43448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2665061833091017e-06, "loss": 0.0, "num_tokens": 249678595.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06365588307380676, "epoch": 0.43456, "grad_norm": 0.0, "learning_rate": 2.2660638181265603e-06, "loss": 0.0, "step": 5432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 226.0703125, "completions/mean_terminated_length": 216.09375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.06570800021290779, "epoch": 0.43464, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2656214168250013e-06, "loss": 0.0, "num_tokens": 249773068.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06902169436216354, "epoch": 0.43472, "grad_norm": 0.0, "learning_rate": 2.26517897943539e-06, "loss": 0.0, "step": 5434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.484375, "completions/mean_terminated_length": 235.1566162109375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.07551927119493484, "epoch": 0.4348, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.264736505988691e-06, "loss": 0.0, "num_tokens": 249869642.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0767599530518055, "epoch": 0.43488, "grad_norm": 0.0, "learning_rate": 2.2642939965158735e-06, "loss": 0.0, "step": 5436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.25, "completions/mean_terminated_length": 212.68292236328125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.06770341843366623, "epoch": 0.43496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2638514510479084e-06, "loss": 0.0, "num_tokens": 249964394.0, "reward": 0.4633024334907532, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4633024334907532, "rewards/reward_fn/std": 0.9901458024978638, "step": 5437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06747998669743538, "epoch": 0.43504, "grad_norm": 0.0, "learning_rate": 2.263408869615769e-06, "loss": 0.0, "step": 5438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 220.328125, "completions/mean_terminated_length": 209.4081573486328, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.0692794993519783, "epoch": 0.43512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2629662522504305e-06, "loss": 0.0, "num_tokens": 250058132.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06960506737232208, "epoch": 0.4352, "grad_norm": 0.0, "learning_rate": 2.2625235989828735e-06, "loss": 0.0, "step": 5440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5234375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 236.2109375, "completions/mean_terminated_length": 214.47540283203125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.07217312231659889, "epoch": 0.43528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.262080909844077e-06, "loss": 0.0, "num_tokens": 250153903.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07270888611674309, "epoch": 0.43536, "grad_norm": 0.0, "learning_rate": 2.2616381848650253e-06, "loss": 0.0, "step": 5442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.984375, "completions/mean_terminated_length": 233.8518524169922, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.0685918852686882, "epoch": 0.43544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.261195424076705e-06, "loss": 0.0, "num_tokens": 250250413.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06442923843860626, "epoch": 0.43552, "grad_norm": 0.0, "learning_rate": 2.2607526275101046e-06, "loss": 0.0, "step": 5444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 233.296875, "completions/mean_terminated_length": 217.2533416748047, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.08160034194588661, "epoch": 0.4356, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.260309795196214e-06, "loss": 0.0, "num_tokens": 250345811.0, "reward": 0.38992840051651, "reward_std": 0.0, "rewards/reward_fn/mean": 0.38992840051651, "rewards/reward_fn/std": 0.9911679029464722, "step": 5445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08043037354946136, "epoch": 0.43568, "grad_norm": 0.0, "learning_rate": 2.2598669271660284e-06, "loss": 0.0, "step": 5446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 224.609375, "completions/mean_terminated_length": 214.14584350585938, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.06633376330137253, "epoch": 0.43576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2594240234505423e-06, "loss": 0.0, "num_tokens": 250440097.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.069214578717947, "epoch": 0.43584, "grad_norm": 0.0, "learning_rate": 2.258981084080756e-06, "loss": 0.0, "step": 5448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 249.296875, "completions/mean_terminated_length": 238.83999633789062, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.07111987099051476, "epoch": 0.43592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2585381090876693e-06, "loss": 0.0, "num_tokens": 250537543.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06990885734558105, "epoch": 0.436, "grad_norm": 0.0, "learning_rate": 2.2580950985022857e-06, "loss": 0.0, "step": 5450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.625, "completions/mean_terminated_length": 231.7333526611328, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.06982055306434631, "epoch": 0.43608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2576520523556125e-06, "loss": 0.0, "num_tokens": 250634391.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06886792927980423, "epoch": 0.43616, "grad_norm": 0.0, "learning_rate": 2.2572089706786567e-06, "loss": 0.0, "step": 5452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 214.375, "completions/mean_terminated_length": 185.89474487304688, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.06601081416010857, "epoch": 0.43624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.25676585350243e-06, "loss": 0.0, "num_tokens": 250727367.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06546357274055481, "epoch": 0.43632, "grad_norm": 0.0, "learning_rate": 2.256322700857947e-06, "loss": 0.0, "step": 5454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.2578125, "completions/mean_terminated_length": 204.2906951904297, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.07873000577092171, "epoch": 0.4364, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2558795127762223e-06, "loss": 0.0, "num_tokens": 250821224.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07639720663428307, "epoch": 0.43648, "grad_norm": 0.0, "learning_rate": 2.2554362892882744e-06, "loss": 0.0, "step": 5456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.4453125, "completions/mean_terminated_length": 240.5189971923828, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.06591055542230606, "epoch": 0.43656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.254993030425125e-06, "loss": 0.0, "num_tokens": 250918305.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0680086798965931, "epoch": 0.43664, "grad_norm": 0.0, "learning_rate": 2.2545497362177973e-06, "loss": 0.0, "step": 5458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.265625, "completions/mean_terminated_length": 236.8307647705078, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.07230296358466148, "epoch": 0.43672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.254106406697317e-06, "loss": 0.0, "num_tokens": 251015363.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0681886151432991, "epoch": 0.4368, "grad_norm": 0.0, "learning_rate": 2.2536630418947126e-06, "loss": 0.0, "step": 5460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 229.484375, "completions/mean_terminated_length": 205.34327697753906, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.07129539921879768, "epoch": 0.43688, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.253219641841015e-06, "loss": 0.0, "num_tokens": 251110273.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06815623492002487, "epoch": 0.43696, "grad_norm": 0.0, "learning_rate": 2.252776206567257e-06, "loss": 0.0, "step": 5462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.921875, "completions/mean_terminated_length": 225.68539428710938, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.07291850447654724, "epoch": 0.43704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2523327361044753e-06, "loss": 0.0, "num_tokens": 251205879.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07481937110424042, "epoch": 0.43712, "grad_norm": 0.0, "learning_rate": 2.2518892304837076e-06, "loss": 0.0, "step": 5464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.46875, "completions/mean_terminated_length": 236.3181915283203, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.06921082362532616, "epoch": 0.4372, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2514456897359947e-06, "loss": 0.0, "num_tokens": 251302451.0, "reward": 0.012458499521017075, "reward_std": 0.0, "rewards/reward_fn/mean": 0.012458499521017075, "rewards/reward_fn/std": 0.03309160843491554, "step": 5465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07006517052650452, "epoch": 0.43728, "grad_norm": 0.0, "learning_rate": 2.25100211389238e-06, "loss": 0.0, "step": 5466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.9140625, "completions/mean_terminated_length": 214.06930541992188, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.06641820073127747, "epoch": 0.43736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.250558502983909e-06, "loss": 0.0, "num_tokens": 251396520.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06920601427555084, "epoch": 0.43744, "grad_norm": 0.0, "learning_rate": 2.2501148570416294e-06, "loss": 0.0, "step": 5468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.3671875, "completions/mean_terminated_length": 228.80722045898438, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.07163767889142036, "epoch": 0.43752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2496711760965917e-06, "loss": 0.0, "num_tokens": 251492567.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07274815440177917, "epoch": 0.4376, "grad_norm": 0.0, "learning_rate": 2.2492274601798493e-06, "loss": 0.0, "step": 5470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.9921875, "completions/mean_terminated_length": 220.08535766601562, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.06215161457657814, "epoch": 0.43768, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2487837093224583e-06, "loss": 0.0, "num_tokens": 251587926.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06299431249499321, "epoch": 0.43776, "grad_norm": 0.0, "learning_rate": 2.2483399235554755e-06, "loss": 0.0, "step": 5472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 211.8828125, "completions/mean_terminated_length": 198.9595947265625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.06531491130590439, "epoch": 0.43784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2478961029099613e-06, "loss": 0.0, "num_tokens": 251680583.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06685145944356918, "epoch": 0.43792, "grad_norm": 0.0, "learning_rate": 2.2474522474169784e-06, "loss": 0.0, "step": 5474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.8359375, "completions/mean_terminated_length": 199.67416381835938, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.07074110582470894, "epoch": 0.438, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2470083571075926e-06, "loss": 0.0, "num_tokens": 251773874.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 5475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07092778757214546, "epoch": 0.43808, "grad_norm": 0.0, "learning_rate": 2.246564432012871e-06, "loss": 0.0, "step": 5476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 210.3828125, "completions/mean_terminated_length": 194.53684997558594, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.06875433027744293, "epoch": 0.43816, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.246120472163884e-06, "loss": 0.0, "num_tokens": 251866339.0, "reward": 0.7953383922576904, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7953383922576904, "rewards/reward_fn/std": 1.28325617313385, "step": 5477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07388237491250038, "epoch": 0.43824, "grad_norm": 0.0, "learning_rate": 2.2456764775917042e-06, "loss": 0.0, "step": 5478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.484375, "completions/mean_terminated_length": 205.7954559326172, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.07263792306184769, "epoch": 0.43832, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2452324483274063e-06, "loss": 0.0, "num_tokens": 251960225.0, "reward": 0.11869087815284729, "reward_std": 0.0, "rewards/reward_fn/mean": 0.11869087815284729, "rewards/reward_fn/std": 0.2931828796863556, "step": 5479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07570772618055344, "epoch": 0.4384, "grad_norm": 0.0, "learning_rate": 2.2447883844020674e-06, "loss": 0.0, "step": 5480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.9296875, "completions/mean_terminated_length": 237.83334350585938, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.07707285135984421, "epoch": 0.43848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2443442858467677e-06, "loss": 0.0, "num_tokens": 252057112.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07633257657289505, "epoch": 0.43856, "grad_norm": 0.0, "learning_rate": 2.2439001526925887e-06, "loss": 0.0, "step": 5482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.7421875, "completions/mean_terminated_length": 232.75, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.074893057346344, "epoch": 0.43864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2434559849706157e-06, "loss": 0.0, "num_tokens": 252153463.0, "reward": 0.009990680031478405, "reward_std": 0.0, "rewards/reward_fn/mean": 0.009990680031478405, "rewards/reward_fn/std": 0.0200558640062809, "step": 5483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07599927112460136, "epoch": 0.43872, "grad_norm": 0.0, "learning_rate": 2.2430117827119364e-06, "loss": 0.0, "step": 5484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.09375, "completions/mean_terminated_length": 217.9220733642578, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.06996626406908035, "epoch": 0.4388, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2425675459476395e-06, "loss": 0.0, "num_tokens": 252248835.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06922107562422752, "epoch": 0.43888, "grad_norm": 0.0, "learning_rate": 2.2421232747088163e-06, "loss": 0.0, "step": 5486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.5625, "completions/mean_terminated_length": 229.5319061279297, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.06963705644011497, "epoch": 0.43896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2416789690265616e-06, "loss": 0.0, "num_tokens": 252344651.0, "reward": 0.498782753944397, "reward_std": 0.0, "rewards/reward_fn/mean": 0.498782753944397, "rewards/reward_fn/std": 1.0033233165740967, "step": 5487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0698208212852478, "epoch": 0.43904, "grad_norm": 0.0, "learning_rate": 2.2412346289319726e-06, "loss": 0.0, "step": 5488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 242.3984375, "completions/mean_terminated_length": 237.67369079589844, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.07649003714323044, "epoch": 0.43912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2407902544561474e-06, "loss": 0.0, "num_tokens": 252441214.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07579788565635681, "epoch": 0.4392, "grad_norm": 0.0, "learning_rate": 2.240345845630189e-06, "loss": 0.0, "step": 5490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.21875, "completions/mean_terminated_length": 227.36000061035156, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.06451336666941643, "epoch": 0.43928, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2399014024852e-06, "loss": 0.0, "num_tokens": 252537370.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06646757572889328, "epoch": 0.43936, "grad_norm": 0.0, "learning_rate": 2.239456925052287e-06, "loss": 0.0, "step": 5492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.0390625, "completions/mean_terminated_length": 236.37179565429688, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.06848396360874176, "epoch": 0.43944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.239012413362559e-06, "loss": 0.0, "num_tokens": 252634143.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06811868399381638, "epoch": 0.43952, "grad_norm": 0.0, "learning_rate": 2.2385678674471267e-06, "loss": 0.0, "step": 5494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.6015625, "completions/mean_terminated_length": 216.96250915527344, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.07101733982563019, "epoch": 0.4396, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.238123287337104e-06, "loss": 0.0, "num_tokens": 252729324.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07024061679840088, "epoch": 0.43968, "grad_norm": 0.0, "learning_rate": 2.2376786730636073e-06, "loss": 0.0, "step": 5496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.3125, "completions/mean_terminated_length": 223.0, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.06492988765239716, "epoch": 0.43976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2372340246577542e-06, "loss": 0.0, "num_tokens": 252824724.0, "reward": 0.05376052483916283, "reward_std": 0.0, "rewards/reward_fn/mean": 0.05376052483916283, "rewards/reward_fn/std": 0.1427958756685257, "step": 5497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06820549070835114, "epoch": 0.43984, "grad_norm": 0.0, "learning_rate": 2.236789342150665e-06, "loss": 0.0, "step": 5498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 237.03125, "completions/mean_terminated_length": 231.47474670410156, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.07276657596230507, "epoch": 0.43992, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.236344625573464e-06, "loss": 0.0, "num_tokens": 252920600.0, "reward": 0.08438373357057571, "reward_std": 0.0, "rewards/reward_fn/mean": 0.08438373357057571, "rewards/reward_fn/std": 0.224135622382164, "step": 5499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06994360685348511, "epoch": 0.44, "grad_norm": 0.0, "learning_rate": 2.235899874957276e-06, "loss": 0.0, "step": 5500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.6953125, "completions/mean_terminated_length": 218.51806640625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.06123117357492447, "epoch": 0.44008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.235455090333228e-06, "loss": 0.0, "num_tokens": 253015793.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06643442064523697, "epoch": 0.44016, "grad_norm": 0.0, "learning_rate": 2.2350102717324517e-06, "loss": 0.0, "step": 5502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 216.1171875, "completions/mean_terminated_length": 195.2261962890625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.06938181445002556, "epoch": 0.44024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.234565419186079e-06, "loss": 0.0, "num_tokens": 253108992.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06740817055106163, "epoch": 0.44032, "grad_norm": 0.0, "learning_rate": 2.2341205327252443e-06, "loss": 0.0, "step": 5504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 224.7109375, "completions/mean_terminated_length": 203.9870147705078, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.07217787206172943, "epoch": 0.4404, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2336756123810864e-06, "loss": 0.0, "num_tokens": 253203291.0, "reward": 0.012458499521017075, "reward_std": 0.0, "rewards/reward_fn/mean": 0.012458499521017075, "rewards/reward_fn/std": 0.03309160843491554, "step": 5505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06990155577659607, "epoch": 0.44048, "grad_norm": 0.0, "learning_rate": 2.233230658184744e-06, "loss": 0.0, "step": 5506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 242.140625, "completions/mean_terminated_length": 232.96104431152344, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.07622037827968597, "epoch": 0.44056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2327856701673592e-06, "loss": 0.0, "num_tokens": 253299821.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07626662030816078, "epoch": 0.44064, "grad_norm": 0.0, "learning_rate": 2.232340648360077e-06, "loss": 0.0, "step": 5508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.453125, "completions/mean_terminated_length": 212.9589080810547, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.06769896671175957, "epoch": 0.44072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2318955927940437e-06, "loss": 0.0, "num_tokens": 253394983.0, "reward": 0.426705539226532, "reward_std": 0.0, "rewards/reward_fn/mean": 0.426705539226532, "rewards/reward_fn/std": 0.9858514070510864, "step": 5509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07306424900889397, "epoch": 0.4408, "grad_norm": 0.0, "learning_rate": 2.2314505035004087e-06, "loss": 0.0, "step": 5510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.3359375, "completions/mean_terminated_length": 211.4558868408203, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.07042254880070686, "epoch": 0.44088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.231005380510323e-06, "loss": 0.0, "num_tokens": 253490258.0, "reward": 0.7473070621490479, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7473070621490479, "rewards/reward_fn/std": 1.093409538269043, "step": 5511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07224331423640251, "epoch": 0.44096, "grad_norm": 0.0, "learning_rate": 2.2305602238549422e-06, "loss": 0.0, "step": 5512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.1328125, "completions/mean_terminated_length": 205.04286193847656, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.06187395192682743, "epoch": 0.44104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2301150335654206e-06, "loss": 0.0, "num_tokens": 253584995.0, "reward": 0.4593837261199951, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4593837261199951, "rewards/reward_fn/std": 0.9892303347587585, "step": 5513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0629935972392559, "epoch": 0.44112, "grad_norm": 0.0, "learning_rate": 2.2296698096729178e-06, "loss": 0.0, "step": 5514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 224.140625, "completions/mean_terminated_length": 216.78846740722656, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.06685071811079979, "epoch": 0.4412, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2292245522085945e-06, "loss": 0.0, "num_tokens": 253679221.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06600194424390793, "epoch": 0.44128, "grad_norm": 0.0, "learning_rate": 2.2287792612036146e-06, "loss": 0.0, "step": 5516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.4453125, "completions/mean_terminated_length": 234.03797912597656, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.07110782340168953, "epoch": 0.44136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2283339366891424e-06, "loss": 0.0, "num_tokens": 253775790.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07315943390130997, "epoch": 0.44144, "grad_norm": 0.0, "learning_rate": 2.2278885786963472e-06, "loss": 0.0, "step": 5518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.890625, "completions/mean_terminated_length": 225.35714721679688, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06384091824293137, "epoch": 0.44152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.227443187256399e-06, "loss": 0.0, "num_tokens": 253871520.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06531821191310883, "epoch": 0.4416, "grad_norm": 0.0, "learning_rate": 2.226997762400471e-06, "loss": 0.0, "step": 5520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 212.140625, "completions/mean_terminated_length": 173.44117736816406, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.05688625946640968, "epoch": 0.44168, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.226552304159737e-06, "loss": 0.0, "num_tokens": 253964210.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05199199914932251, "epoch": 0.44176, "grad_norm": 0.0, "learning_rate": 2.226106812565375e-06, "loss": 0.0, "step": 5522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5390625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 248.453125, "completions/mean_terminated_length": 239.6271209716797, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.06755665317177773, "epoch": 0.44184, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.225661287648565e-06, "loss": 0.0, "num_tokens": 254061548.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06463119760155678, "epoch": 0.44192, "grad_norm": 0.0, "learning_rate": 2.2252157294404886e-06, "loss": 0.0, "step": 5524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.015625, "completions/mean_terminated_length": 236.3384552001953, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.0647052451968193, "epoch": 0.442, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.224770137972331e-06, "loss": 0.0, "num_tokens": 254158574.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06414960511028767, "epoch": 0.44208, "grad_norm": 0.0, "learning_rate": 2.2243245132752776e-06, "loss": 0.0, "step": 5526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 249.8203125, "completions/mean_terminated_length": 240.78846740722656, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.07029587402939796, "epoch": 0.44216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.223878855380518e-06, "loss": 0.0, "num_tokens": 254256087.0, "reward": 0.009978720918297768, "reward_std": 0.0, "rewards/reward_fn/mean": 0.009978720918297768, "rewards/reward_fn/std": 0.02650495432317257, "step": 5527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0738796517252922, "epoch": 0.44224, "grad_norm": 0.0, "learning_rate": 2.223433164319244e-06, "loss": 0.0, "step": 5528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 243.046875, "completions/mean_terminated_length": 233.89334106445312, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.06851592659950256, "epoch": 0.44232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2229874401226486e-06, "loss": 0.0, "num_tokens": 254352733.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06649716198444366, "epoch": 0.4424, "grad_norm": 0.0, "learning_rate": 2.2225416828219283e-06, "loss": 0.0, "step": 5530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 219.9140625, "completions/mean_terminated_length": 215.48245239257812, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.07273327559232712, "epoch": 0.44248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2220958924482814e-06, "loss": 0.0, "num_tokens": 254446418.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07462961599230766, "epoch": 0.44256, "grad_norm": 0.0, "learning_rate": 2.2216500690329084e-06, "loss": 0.0, "step": 5532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.0078125, "completions/mean_terminated_length": 235.60870361328125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.07285628467798233, "epoch": 0.44264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2212042126070114e-06, "loss": 0.0, "num_tokens": 254543315.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07680651918053627, "epoch": 0.44272, "grad_norm": 0.0, "learning_rate": 2.220758323201797e-06, "loss": 0.0, "step": 5534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.703125, "completions/mean_terminated_length": 212.78260803222656, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.07112118229269981, "epoch": 0.4428, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2203124008484715e-06, "loss": 0.0, "num_tokens": 254638637.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06860165297985077, "epoch": 0.44288, "grad_norm": 0.0, "learning_rate": 2.2198664455782455e-06, "loss": 0.0, "step": 5536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 245.1953125, "completions/mean_terminated_length": 235.3582000732422, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.07380636781454086, "epoch": 0.44296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2194204574223318e-06, "loss": 0.0, "num_tokens": 254735558.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0738460160791874, "epoch": 0.44304, "grad_norm": 0.0, "learning_rate": 2.2189744364119433e-06, "loss": 0.0, "step": 5538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 224.921875, "completions/mean_terminated_length": 212.76087951660156, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.06785047799348831, "epoch": 0.44312, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2185283825782985e-06, "loss": 0.0, "num_tokens": 254829884.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06433628126978874, "epoch": 0.4432, "grad_norm": 0.0, "learning_rate": 2.2180822959526146e-06, "loss": 0.0, "step": 5540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.0234375, "completions/mean_terminated_length": 235.6704559326172, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.06877413019537926, "epoch": 0.44328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.217636176566114e-06, "loss": 0.0, "num_tokens": 254926399.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06737992540001869, "epoch": 0.44336, "grad_norm": 0.0, "learning_rate": 2.2171900244500205e-06, "loss": 0.0, "step": 5542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 226.421875, "completions/mean_terminated_length": 220.28302001953125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.07000569254159927, "epoch": 0.44344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.216743839635559e-06, "loss": 0.0, "num_tokens": 255020917.0, "reward": 0.04093467444181442, "reward_std": 0.0, "rewards/reward_fn/mean": 0.04093467444181442, "rewards/reward_fn/std": 0.10872851312160492, "step": 5543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0702623799443245, "epoch": 0.44352, "grad_norm": 0.0, "learning_rate": 2.2162976221539596e-06, "loss": 0.0, "step": 5544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.1953125, "completions/mean_terminated_length": 240.8636474609375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.07418393343687057, "epoch": 0.4436, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2158513720364512e-06, "loss": 0.0, "num_tokens": 255118222.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07125095278024673, "epoch": 0.44368, "grad_norm": 0.0, "learning_rate": 2.215405089314267e-06, "loss": 0.0, "step": 5546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 241.859375, "completions/mean_terminated_length": 232.7948760986328, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.06880029290914536, "epoch": 0.44376, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.214958774018642e-06, "loss": 0.0, "num_tokens": 255214716.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06572695076465607, "epoch": 0.44384, "grad_norm": 0.0, "learning_rate": 2.214512426180814e-06, "loss": 0.0, "step": 5548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 237.8515625, "completions/mean_terminated_length": 226.59494018554688, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.060060760006308556, "epoch": 0.44392, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2140660458320223e-06, "loss": 0.0, "num_tokens": 255310697.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0616269875317812, "epoch": 0.444, "grad_norm": 0.0, "learning_rate": 2.2136196330035087e-06, "loss": 0.0, "step": 5550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.0546875, "completions/mean_terminated_length": 219.30850219726562, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06901959329843521, "epoch": 0.44408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.213173187726518e-06, "loss": 0.0, "num_tokens": 255405552.0, "reward": 0.38249102234840393, "reward_std": 0.0, "rewards/reward_fn/mean": 0.38249102234840393, "rewards/reward_fn/std": 0.9934079051017761, "step": 5551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07082848623394966, "epoch": 0.44416, "grad_norm": 0.0, "learning_rate": 2.212726710032296e-06, "loss": 0.0, "step": 5552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.59375, "completions/mean_terminated_length": 239.45834350585938, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.0642697885632515, "epoch": 0.44424, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.212280199952091e-06, "loss": 0.0, "num_tokens": 255502268.0, "reward": 0.869311511516571, "reward_std": 0.0, "rewards/reward_fn/mean": 0.869311511516571, "rewards/reward_fn/std": 1.273123025894165, "step": 5553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06612906605005264, "epoch": 0.44432, "grad_norm": 0.0, "learning_rate": 2.211833657517155e-06, "loss": 0.0, "step": 5554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 234.4296875, "completions/mean_terminated_length": 218.68919372558594, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.0714486613869667, "epoch": 0.4444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.211387082758741e-06, "loss": 0.0, "num_tokens": 255597811.0, "reward": 0.10006237775087357, "reward_std": 0.0, "rewards/reward_fn/mean": 0.10006237775087357, "rewards/reward_fn/std": 0.26578041911125183, "step": 5555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0659283921122551, "epoch": 0.44448, "grad_norm": 0.0, "learning_rate": 2.2109404757081046e-06, "loss": 0.0, "step": 5556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 240.7109375, "completions/mean_terminated_length": 232.702392578125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.07011257484555244, "epoch": 0.44456, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.210493836396503e-06, "loss": 0.0, "num_tokens": 255694158.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0693213976919651, "epoch": 0.44464, "grad_norm": 0.0, "learning_rate": 2.2100471648551962e-06, "loss": 0.0, "step": 5558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 230.5390625, "completions/mean_terminated_length": 212.5466766357422, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.07818996906280518, "epoch": 0.44472, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2096004611154475e-06, "loss": 0.0, "num_tokens": 255789203.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07675136253237724, "epoch": 0.4448, "grad_norm": 0.0, "learning_rate": 2.2091537252085206e-06, "loss": 0.0, "step": 5560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5859375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.421875, "completions/mean_terminated_length": 235.28302001953125, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 0.06645815446972847, "epoch": 0.44488, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2087069571656825e-06, "loss": 0.0, "num_tokens": 255886409.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06683522462844849, "epoch": 0.44496, "grad_norm": 0.0, "learning_rate": 2.208260157018202e-06, "loss": 0.0, "step": 5562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.078125, "completions/mean_terminated_length": 242.19566345214844, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.06477542966604233, "epoch": 0.44504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2078133247973514e-06, "loss": 0.0, "num_tokens": 255983443.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07014409825205803, "epoch": 0.44512, "grad_norm": 0.0, "learning_rate": 2.207366460534403e-06, "loss": 0.0, "step": 5564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.734375, "completions/mean_terminated_length": 240.35714721679688, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.07012141868472099, "epoch": 0.4452, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.206919564260633e-06, "loss": 0.0, "num_tokens": 256080433.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07056662067770958, "epoch": 0.44528, "grad_norm": 0.0, "learning_rate": 2.20647263600732e-06, "loss": 0.0, "step": 5566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 244.1953125, "completions/mean_terminated_length": 240.42266845703125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.06200640834867954, "epoch": 0.44536, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2060256758057433e-06, "loss": 0.0, "num_tokens": 256177226.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06325728259980679, "epoch": 0.44544, "grad_norm": 0.0, "learning_rate": 2.205578683687186e-06, "loss": 0.0, "step": 5568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.1484375, "completions/mean_terminated_length": 212.49368286132812, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.06572835892438889, "epoch": 0.44552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.205131659682932e-06, "loss": 0.0, "num_tokens": 256272093.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06318297050893307, "epoch": 0.4456, "grad_norm": 0.0, "learning_rate": 2.2046846038242693e-06, "loss": 0.0, "step": 5570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.6171875, "completions/mean_terminated_length": 221.06451416015625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.06975320354104042, "epoch": 0.44568, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2042375161424868e-06, "loss": 0.0, "num_tokens": 256367148.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07082987576723099, "epoch": 0.44576, "grad_norm": 0.0, "learning_rate": 2.2037903966688764e-06, "loss": 0.0, "step": 5572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.1484375, "completions/mean_terminated_length": 202.8928680419922, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.06681742519140244, "epoch": 0.44584, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2033432454347306e-06, "loss": 0.0, "num_tokens": 256460991.0, "reward": 0.7673865556716919, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7673865556716919, "rewards/reward_fn/std": 1.2948493957519531, "step": 5573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06643757596611977, "epoch": 0.44592, "grad_norm": 0.0, "learning_rate": 2.202896062471346e-06, "loss": 0.0, "step": 5574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.046875, "completions/mean_terminated_length": 212.8915557861328, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.06614401564002037, "epoch": 0.446, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2024488478100203e-06, "loss": 0.0, "num_tokens": 256555717.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06768212467432022, "epoch": 0.44608, "grad_norm": 0.0, "learning_rate": 2.202001601482054e-06, "loss": 0.0, "step": 5576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.6015625, "completions/mean_terminated_length": 239.05633544921875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.07438862323760986, "epoch": 0.44616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2015543235187495e-06, "loss": 0.0, "num_tokens": 256652818.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07075132802128792, "epoch": 0.44624, "grad_norm": 0.0, "learning_rate": 2.201107013951412e-06, "loss": 0.0, "step": 5578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 240.1953125, "completions/mean_terminated_length": 236.91510009765625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.06697085872292519, "epoch": 0.44632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.200659672811348e-06, "loss": 0.0, "num_tokens": 256749099.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06758426502346992, "epoch": 0.4464, "grad_norm": 0.0, "learning_rate": 2.200212300129867e-06, "loss": 0.0, "step": 5580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 216.921875, "completions/mean_terminated_length": 201.03297424316406, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.06867717951536179, "epoch": 0.44648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.19976489593828e-06, "loss": 0.0, "num_tokens": 256842401.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06950914487242699, "epoch": 0.44656, "grad_norm": 0.0, "learning_rate": 2.1993174602679005e-06, "loss": 0.0, "step": 5582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 240.5703125, "completions/mean_terminated_length": 233.55682373046875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.061918992549180984, "epoch": 0.44664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1988699931500444e-06, "loss": 0.0, "num_tokens": 256938730.0, "reward": 0.25, "reward_std": 0.0, "rewards/reward_fn/mean": 0.25, "rewards/reward_fn/std": 0.6640368103981018, "step": 5583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06454957276582718, "epoch": 0.44672, "grad_norm": 0.0, "learning_rate": 2.1984224946160298e-06, "loss": 0.0, "step": 5584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 214.3515625, "completions/mean_terminated_length": 186.7662353515625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.07826124504208565, "epoch": 0.4468, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1979749646971773e-06, "loss": 0.0, "num_tokens": 257031703.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07419582456350327, "epoch": 0.44688, "grad_norm": 0.0, "learning_rate": 2.1975274034248087e-06, "loss": 0.0, "step": 5586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.28125, "completions/mean_terminated_length": 225.51724243164062, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.07046199962496758, "epoch": 0.44696, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.197079810830248e-06, "loss": 0.0, "num_tokens": 257127355.0, "reward": 0.01492841262370348, "reward_std": 0.0, "rewards/reward_fn/mean": 0.01492841262370348, "rewards/reward_fn/std": 0.039652060717344284, "step": 5587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06716042011976242, "epoch": 0.44704, "grad_norm": 0.0, "learning_rate": 2.1966321869448233e-06, "loss": 0.0, "step": 5588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.984375, "completions/mean_terminated_length": 233.5749969482422, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.07364282384514809, "epoch": 0.44712, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1961845317998623e-06, "loss": 0.0, "num_tokens": 257223865.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07312272861599922, "epoch": 0.4472, "grad_norm": 0.0, "learning_rate": 2.1957368454266964e-06, "loss": 0.0, "step": 5590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.578125, "completions/mean_terminated_length": 240.14706420898438, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.06402158737182617, "epoch": 0.44728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1952891278566603e-06, "loss": 0.0, "num_tokens": 257321091.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06453599035739899, "epoch": 0.44736, "grad_norm": 0.0, "learning_rate": 2.1948413791210876e-06, "loss": 0.0, "step": 5592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.3046875, "completions/mean_terminated_length": 233.65670776367188, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.07676096260547638, "epoch": 0.44744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1943935992513175e-06, "loss": 0.0, "num_tokens": 257417898.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07908939570188522, "epoch": 0.44752, "grad_norm": 0.0, "learning_rate": 2.193945788278688e-06, "loss": 0.0, "step": 5594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 240.0390625, "completions/mean_terminated_length": 230.13925170898438, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.0718935877084732, "epoch": 0.4476, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.193497946234543e-06, "loss": 0.0, "num_tokens": 257514159.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07329962775111198, "epoch": 0.44768, "grad_norm": 0.0, "learning_rate": 2.193050073150226e-06, "loss": 0.0, "step": 5596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5390625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.3828125, "completions/mean_terminated_length": 200.42373657226562, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.062401341274380684, "epoch": 0.44776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1926021690570825e-06, "loss": 0.0, "num_tokens": 257609184.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06248939968645573, "epoch": 0.44784, "grad_norm": 0.0, "learning_rate": 2.192154233986463e-06, "loss": 0.0, "step": 5598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.78125, "completions/mean_terminated_length": 225.629638671875, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.06669394671916962, "epoch": 0.44792, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.191706267969717e-06, "loss": 0.0, "num_tokens": 257705028.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06767382100224495, "epoch": 0.448, "grad_norm": 0.0, "learning_rate": 2.1912582710381974e-06, "loss": 0.0, "step": 5600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.078125, "completions/mean_terminated_length": 239.04444885253906, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.06778708845376968, "epoch": 0.44808, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.190810243223259e-06, "loss": 0.0, "num_tokens": 257801806.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06816000118851662, "epoch": 0.44816, "grad_norm": 0.0, "learning_rate": 2.19036218455626e-06, "loss": 0.0, "step": 5602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.0078125, "completions/mean_terminated_length": 208.25372314453125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.05999820865690708, "epoch": 0.44824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1899140950685584e-06, "loss": 0.0, "num_tokens": 257896911.0, "reward": 0.002499666763469577, "reward_std": 0.0, "rewards/reward_fn/mean": 0.002499666763469577, "rewards/reward_fn/std": 0.006639483384788036, "step": 5603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.057036394253373146, "epoch": 0.44832, "grad_norm": 0.0, "learning_rate": 2.1894659747915175e-06, "loss": 0.0, "step": 5604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5234375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 244.203125, "completions/mean_terminated_length": 231.2458953857422, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.07079143077135086, "epoch": 0.4484, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1890178237564996e-06, "loss": 0.0, "num_tokens": 257993705.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07235733047127724, "epoch": 0.44848, "grad_norm": 0.0, "learning_rate": 2.1885696419948707e-06, "loss": 0.0, "step": 5606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.515625, "completions/mean_terminated_length": 238.10667419433594, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.0695977695286274, "epoch": 0.44856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1881214295379993e-06, "loss": 0.0, "num_tokens": 258090667.0, "reward": 0.10706061124801636, "reward_std": 0.0, "rewards/reward_fn/mean": 0.10706061124801636, "rewards/reward_fn/std": 0.28436875343322754, "step": 5607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0733751468360424, "epoch": 0.44864, "grad_norm": 0.0, "learning_rate": 2.1876731864172554e-06, "loss": 0.0, "step": 5608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.2734375, "completions/mean_terminated_length": 236.13414001464844, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.07022570446133614, "epoch": 0.44872, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1872249126640105e-06, "loss": 0.0, "num_tokens": 258187342.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0708128958940506, "epoch": 0.4488, "grad_norm": 0.0, "learning_rate": 2.1867766083096403e-06, "loss": 0.0, "step": 5610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.296875, "completions/mean_terminated_length": 236.87059020996094, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.07017727568745613, "epoch": 0.44888, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1863282733855204e-06, "loss": 0.0, "num_tokens": 258284020.0, "reward": 0.4020647704601288, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4020647704601288, "rewards/reward_fn/std": 0.9883497953414917, "step": 5611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07038086652755737, "epoch": 0.44896, "grad_norm": 0.0, "learning_rate": 2.18587990792303e-06, "loss": 0.0, "step": 5612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 217.984375, "completions/mean_terminated_length": 201.3258514404297, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.06445904821157455, "epoch": 0.44904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1854315119535502e-06, "loss": 0.0, "num_tokens": 258377458.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0665384829044342, "epoch": 0.44912, "grad_norm": 0.0, "learning_rate": 2.184983085508463e-06, "loss": 0.0, "step": 5614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 235.546875, "completions/mean_terminated_length": 215.72308349609375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.06965912505984306, "epoch": 0.4492, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1845346286191542e-06, "loss": 0.0, "num_tokens": 258473144.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07334906607866287, "epoch": 0.44928, "grad_norm": 0.0, "learning_rate": 2.1840861413170114e-06, "loss": 0.0, "step": 5616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.3515625, "completions/mean_terminated_length": 214.58570861816406, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.07202469930052757, "epoch": 0.44936, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.183637623633423e-06, "loss": 0.0, "num_tokens": 258568549.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07177786901593208, "epoch": 0.44944, "grad_norm": 0.0, "learning_rate": 2.1831890755997815e-06, "loss": 0.0, "step": 5618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.0546875, "completions/mean_terminated_length": 231.1097412109375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.07079891115427017, "epoch": 0.44952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.18274049724748e-06, "loss": 0.0, "num_tokens": 258664812.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07197948545217514, "epoch": 0.4496, "grad_norm": 0.0, "learning_rate": 2.182291888607914e-06, "loss": 0.0, "step": 5620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 229.7578125, "completions/mean_terminated_length": 212.93589782714844, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.066353939473629, "epoch": 0.44968, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1818432497124814e-06, "loss": 0.0, "num_tokens": 258759757.0, "reward": 0.002499666763469577, "reward_std": 0.0, "rewards/reward_fn/mean": 0.002499666763469577, "rewards/reward_fn/std": 0.006639483384788036, "step": 5621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06312483176589012, "epoch": 0.44976, "grad_norm": 0.0, "learning_rate": 2.181394580592583e-06, "loss": 0.0, "step": 5622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 217.5, "completions/mean_terminated_length": 200.0, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.07384161278605461, "epoch": 0.44984, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.18094588127962e-06, "loss": 0.0, "num_tokens": 258853133.0, "reward": 1.5, "reward_std": 0.0, "rewards/reward_fn/mean": 1.5, "rewards/reward_fn/std": 1.5058939456939697, "step": 5623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07143349573016167, "epoch": 0.44992, "grad_norm": 0.0, "learning_rate": 2.180497151804997e-06, "loss": 0.0, "step": 5624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.484375, "completions/mean_terminated_length": 237.34178161621094, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.07048334181308746, "epoch": 0.45, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.18004839220012e-06, "loss": 0.0, "num_tokens": 258949963.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07362429052591324, "epoch": 0.45008, "grad_norm": 0.0, "learning_rate": 2.1795996024963985e-06, "loss": 0.0, "step": 5626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 223.8125, "completions/mean_terminated_length": 214.79998779296875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.06902608647942543, "epoch": 0.45016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.179150782725242e-06, "loss": 0.0, "num_tokens": 259044147.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07059457898139954, "epoch": 0.45024, "grad_norm": 0.0, "learning_rate": 2.1787019329180632e-06, "loss": 0.0, "step": 5628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 221.9375, "completions/mean_terminated_length": 202.82925415039062, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.06643674895167351, "epoch": 0.45032, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1782530531062775e-06, "loss": 0.0, "num_tokens": 259138091.0, "reward": 0.4990789294242859, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4990789294242859, "rewards/reward_fn/std": 1.0034698247909546, "step": 5629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06685441359877586, "epoch": 0.4504, "grad_norm": 0.0, "learning_rate": 2.1778041433213004e-06, "loss": 0.0, "step": 5630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 232.1953125, "completions/mean_terminated_length": 220.15293884277344, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.0725826844573021, "epoch": 0.45048, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1773552035945523e-06, "loss": 0.0, "num_tokens": 259233348.0, "reward": 0.4347124993801117, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4347124993801117, "rewards/reward_fn/std": 0.9859739542007446, "step": 5631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07445170357823372, "epoch": 0.45056, "grad_norm": 0.0, "learning_rate": 2.1769062339574533e-06, "loss": 0.0, "step": 5632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.578125, "completions/mean_terminated_length": 221.47222900390625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.07330146431922913, "epoch": 0.45064, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1764572344414273e-06, "loss": 0.0, "num_tokens": 259329166.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0722239688038826, "epoch": 0.45072, "grad_norm": 0.0, "learning_rate": 2.1760082050778995e-06, "loss": 0.0, "step": 5634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 218.9921875, "completions/mean_terminated_length": 195.26922607421875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.06733502447605133, "epoch": 0.4508, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.175559145898296e-06, "loss": 0.0, "num_tokens": 259422733.0, "reward": 0.4366234838962555, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4366234838962555, "rewards/reward_fn/std": 0.9860709309577942, "step": 5635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06346135213971138, "epoch": 0.45088, "grad_norm": 0.0, "learning_rate": 2.175110056934047e-06, "loss": 0.0, "step": 5636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.2421875, "completions/mean_terminated_length": 234.8028106689453, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.07050995528697968, "epoch": 0.45096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1746609382165847e-06, "loss": 0.0, "num_tokens": 259519532.0, "reward": 0.08830241858959198, "reward_std": 0.0, "rewards/reward_fn/mean": 0.08830241858959198, "rewards/reward_fn/std": 0.23454421758651733, "step": 5637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07081001251935959, "epoch": 0.45104, "grad_norm": 0.0, "learning_rate": 2.1742117897773414e-06, "loss": 0.0, "step": 5638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.46875, "completions/mean_terminated_length": 216.25926208496094, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.06984411552548409, "epoch": 0.45112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.173762611647754e-06, "loss": 0.0, "num_tokens": 259613544.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0689382553100586, "epoch": 0.4512, "grad_norm": 0.0, "learning_rate": 2.1733134038592595e-06, "loss": 0.0, "step": 5640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.2734375, "completions/mean_terminated_length": 228.76136779785156, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.0646895058453083, "epoch": 0.45128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1728641664432975e-06, "loss": 0.0, "num_tokens": 259709451.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06652893126010895, "epoch": 0.45136, "grad_norm": 0.0, "learning_rate": 2.17241489943131e-06, "loss": 0.0, "step": 5642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.7109375, "completions/mean_terminated_length": 234.01124572753906, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.06713293492794037, "epoch": 0.45144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.171965602854741e-06, "loss": 0.0, "num_tokens": 259805798.0, "reward": 0.37749966979026794, "reward_std": 0.0, "rewards/reward_fn/mean": 0.37749966979026794, "rewards/reward_fn/std": 0.9951284527778625, "step": 5643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06444654241204262, "epoch": 0.45152, "grad_norm": 0.0, "learning_rate": 2.1715162767450367e-06, "loss": 0.0, "step": 5644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.734375, "completions/mean_terminated_length": 225.0681915283203, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.07248319685459137, "epoch": 0.4516, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1710669211336453e-06, "loss": 0.0, "num_tokens": 259901380.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 5645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07101354002952576, "epoch": 0.45168, "grad_norm": 0.0, "learning_rate": 2.1706175360520172e-06, "loss": 0.0, "step": 5646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 239.4140625, "completions/mean_terminated_length": 228.7820587158203, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.06826495751738548, "epoch": 0.45176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1701681215316032e-06, "loss": 0.0, "num_tokens": 259997561.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07250524312257767, "epoch": 0.45184, "grad_norm": 0.0, "learning_rate": 2.169718677603859e-06, "loss": 0.0, "step": 5648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.96875, "completions/mean_terminated_length": 218.30589294433594, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.06337684392929077, "epoch": 0.45192, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.16926920430024e-06, "loss": 0.0, "num_tokens": 260092661.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0619119256734848, "epoch": 0.452, "grad_norm": 0.0, "learning_rate": 2.168819701652205e-06, "loss": 0.0, "step": 5650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 213.9453125, "completions/mean_terminated_length": 199.92709350585938, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.06962292641401291, "epoch": 0.45208, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.168370169691215e-06, "loss": 0.0, "num_tokens": 260185582.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06773530319333076, "epoch": 0.45216, "grad_norm": 0.0, "learning_rate": 2.1679206084487316e-06, "loss": 0.0, "step": 5652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.6015625, "completions/mean_terminated_length": 234.5178680419922, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.06759331375360489, "epoch": 0.45224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1674710179562204e-06, "loss": 0.0, "num_tokens": 260282683.0, "reward": 0.426705539226532, "reward_std": 0.0, "rewards/reward_fn/mean": 0.426705539226532, "rewards/reward_fn/std": 0.9858514070510864, "step": 5653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06882346794009209, "epoch": 0.45232, "grad_norm": 0.0, "learning_rate": 2.1670213982451462e-06, "loss": 0.0, "step": 5654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.0703125, "completions/mean_terminated_length": 236.0602264404297, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.07086677476763725, "epoch": 0.4524, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1665717493469795e-06, "loss": 0.0, "num_tokens": 260379332.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07219372317194939, "epoch": 0.45248, "grad_norm": 0.0, "learning_rate": 2.1661220712931895e-06, "loss": 0.0, "step": 5656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.8515625, "completions/mean_terminated_length": 219.60606384277344, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.06815556436777115, "epoch": 0.45256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.16567236411525e-06, "loss": 0.0, "num_tokens": 260474033.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06774311512708664, "epoch": 0.45264, "grad_norm": 0.0, "learning_rate": 2.1652226278446355e-06, "loss": 0.0, "step": 5658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.5, "completions/mean_terminated_length": 230.2337646484375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.07147365435957909, "epoch": 0.45272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.164772862512822e-06, "loss": 0.0, "num_tokens": 260570353.0, "reward": 0.08953723311424255, "reward_std": 0.0, "rewards/reward_fn/mean": 0.08953723311424255, "rewards/reward_fn/std": 0.23782405257225037, "step": 5659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0708366222679615, "epoch": 0.4528, "grad_norm": 0.0, "learning_rate": 2.16432306815129e-06, "loss": 0.0, "step": 5660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 220.609375, "completions/mean_terminated_length": 210.6999969482422, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.06735096499323845, "epoch": 0.45288, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1638732447915185e-06, "loss": 0.0, "num_tokens": 260664127.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0674101822078228, "epoch": 0.45296, "grad_norm": 0.0, "learning_rate": 2.163423392464991e-06, "loss": 0.0, "step": 5662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 224.8671875, "completions/mean_terminated_length": 216.93138122558594, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.061381746083498, "epoch": 0.45304, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.162973511203193e-06, "loss": 0.0, "num_tokens": 260758446.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06407180055975914, "epoch": 0.45312, "grad_norm": 0.0, "learning_rate": 2.1625236010376112e-06, "loss": 0.0, "step": 5664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 233.9921875, "completions/mean_terminated_length": 217.41094970703125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.07587289810180664, "epoch": 0.4532, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.162073661999734e-06, "loss": 0.0, "num_tokens": 260853933.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07637567073106766, "epoch": 0.45328, "grad_norm": 0.0, "learning_rate": 2.1616236941210532e-06, "loss": 0.0, "step": 5666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 224.1796875, "completions/mean_terminated_length": 208.63954162597656, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.0682605728507042, "epoch": 0.45336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.161173697433061e-06, "loss": 0.0, "num_tokens": 260948164.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0703759714961052, "epoch": 0.45344, "grad_norm": 0.0, "learning_rate": 2.160723671967253e-06, "loss": 0.0, "step": 5668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.5, "completions/mean_terminated_length": 211.12643432617188, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.06515209376811981, "epoch": 0.45352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.160273617755126e-06, "loss": 0.0, "num_tokens": 261042564.0, "reward": 0.7673865556716919, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7673865556716919, "rewards/reward_fn/std": 1.2948493957519531, "step": 5669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06900715082883835, "epoch": 0.4536, "grad_norm": 0.0, "learning_rate": 2.159823534828179e-06, "loss": 0.0, "step": 5670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.046875, "completions/mean_terminated_length": 235.11474609375, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.06947439908981323, "epoch": 0.45368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1593734232179135e-06, "loss": 0.0, "num_tokens": 261139594.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06908228248357773, "epoch": 0.45376, "grad_norm": 0.0, "learning_rate": 2.1589232829558315e-06, "loss": 0.0, "step": 5672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 219.359375, "completions/mean_terminated_length": 197.375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.072773028165102, "epoch": 0.45384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1584731140734393e-06, "loss": 0.0, "num_tokens": 261233208.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 5673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0802743211388588, "epoch": 0.45392, "grad_norm": 0.0, "learning_rate": 2.158022916602243e-06, "loss": 0.0, "step": 5674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 233.2890625, "completions/mean_terminated_length": 221.8000030517578, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.07445190846920013, "epoch": 0.454, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.157572690573752e-06, "loss": 0.0, "num_tokens": 261328605.0, "reward": 0.21281403303146362, "reward_std": 0.0, "rewards/reward_fn/mean": 0.21281403303146362, "rewards/reward_fn/std": 0.3718811571598053, "step": 5675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07126553356647491, "epoch": 0.45408, "grad_norm": 0.0, "learning_rate": 2.157122436019478e-06, "loss": 0.0, "step": 5676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 208.171875, "completions/mean_terminated_length": 192.2291717529297, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.07024645060300827, "epoch": 0.45416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1566721529709334e-06, "loss": 0.0, "num_tokens": 261420787.0, "reward": 0.49993911385536194, "reward_std": 0.0, "rewards/reward_fn/mean": 0.49993911385536194, "rewards/reward_fn/std": 1.0038987398147583, "step": 5677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06987990438938141, "epoch": 0.45424, "grad_norm": 0.0, "learning_rate": 2.156221841459633e-06, "loss": 0.0, "step": 5678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 236.453125, "completions/mean_terminated_length": 214.30001831054688, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.07200530543923378, "epoch": 0.45432, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.155771501517094e-06, "loss": 0.0, "num_tokens": 261516589.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06730147078633308, "epoch": 0.4544, "grad_norm": 0.0, "learning_rate": 2.1553211331748363e-06, "loss": 0.0, "step": 5680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.6796875, "completions/mean_terminated_length": 239.2784881591797, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.0756622962653637, "epoch": 0.45448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.15487073646438e-06, "loss": 0.0, "num_tokens": 261613572.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07612700015306473, "epoch": 0.45456, "grad_norm": 0.0, "learning_rate": 2.1544203114172486e-06, "loss": 0.0, "step": 5682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.984375, "completions/mean_terminated_length": 225.36956787109375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.0672759860754013, "epoch": 0.45464, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1539698580649668e-06, "loss": 0.0, "num_tokens": 261709058.0, "reward": 0.46573716402053833, "reward_std": 0.0, "rewards/reward_fn/mean": 0.46573716402053833, "rewards/reward_fn/std": 0.9907692074775696, "step": 5683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0684817023575306, "epoch": 0.45472, "grad_norm": 0.0, "learning_rate": 2.1535193764390614e-06, "loss": 0.0, "step": 5684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.4609375, "completions/mean_terminated_length": 217.13829040527344, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.06275155767798424, "epoch": 0.4548, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1530688665710613e-06, "loss": 0.0, "num_tokens": 261803709.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06326929107308388, "epoch": 0.45488, "grad_norm": 0.0, "learning_rate": 2.1526183284924988e-06, "loss": 0.0, "step": 5686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 201.5234375, "completions/mean_terminated_length": 182.60000610351562, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.0700039230287075, "epoch": 0.45496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.152167762234905e-06, "loss": 0.0, "num_tokens": 261895040.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07209782302379608, "epoch": 0.45504, "grad_norm": 0.0, "learning_rate": 2.1517171678298155e-06, "loss": 0.0, "step": 5688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 241.78125, "completions/mean_terminated_length": 234.07228088378906, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.06924354657530785, "epoch": 0.45512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.151266545308768e-06, "loss": 0.0, "num_tokens": 261991524.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06782825291156769, "epoch": 0.4552, "grad_norm": 0.0, "learning_rate": 2.1508158947032996e-06, "loss": 0.0, "step": 5690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 232.1796875, "completions/mean_terminated_length": 216.91026306152344, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.06806235760450363, "epoch": 0.45528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1503652160449523e-06, "loss": 0.0, "num_tokens": 262086779.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06643253937363625, "epoch": 0.45536, "grad_norm": 0.0, "learning_rate": 2.149914509365268e-06, "loss": 0.0, "step": 5692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.921875, "completions/mean_terminated_length": 226.3076934814453, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.0752897597849369, "epoch": 0.45544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1494637746957927e-06, "loss": 0.0, "num_tokens": 262183153.0, "reward": 0.004997334908694029, "reward_std": 0.0, "rewards/reward_fn/mean": 0.004997334908694029, "rewards/reward_fn/std": 0.013273656368255615, "step": 5693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07327689602971077, "epoch": 0.45552, "grad_norm": 0.0, "learning_rate": 2.1490130120680725e-06, "loss": 0.0, "step": 5694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 231.859375, "completions/mean_terminated_length": 221.28089904785156, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.06319857947528362, "epoch": 0.4556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1485622215136553e-06, "loss": 0.0, "num_tokens": 262278367.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06187349371612072, "epoch": 0.45568, "grad_norm": 0.0, "learning_rate": 2.1481114030640922e-06, "loss": 0.0, "step": 5696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.84375, "completions/mean_terminated_length": 234.08450317382812, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.07249414175748825, "epoch": 0.45576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1476605567509356e-06, "loss": 0.0, "num_tokens": 262375115.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07134635001420975, "epoch": 0.45584, "grad_norm": 0.0, "learning_rate": 2.1472096826057405e-06, "loss": 0.0, "step": 5698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.0859375, "completions/mean_terminated_length": 234.05746459960938, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.0752083845436573, "epoch": 0.45592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.146758780660063e-06, "loss": 0.0, "num_tokens": 262471510.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 1.5058939456939697, "step": 5699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07634678483009338, "epoch": 0.456, "grad_norm": 0.0, "learning_rate": 2.1463078509454616e-06, "loss": 0.0, "step": 5700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 218.03125, "completions/mean_terminated_length": 211.81817626953125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.06690427660942078, "epoch": 0.45608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1458568934934957e-06, "loss": 0.0, "num_tokens": 262564954.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06916900351643562, "epoch": 0.45616, "grad_norm": 0.0, "learning_rate": 2.1454059083357284e-06, "loss": 0.0, "step": 5702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.5078125, "completions/mean_terminated_length": 217.75, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.0766788087785244, "epoch": 0.45624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1449548955037243e-06, "loss": 0.0, "num_tokens": 262659739.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07465716078877449, "epoch": 0.45632, "grad_norm": 0.0, "learning_rate": 2.1445038550290482e-06, "loss": 0.0, "step": 5704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 215.0625, "completions/mean_terminated_length": 203.07070922851562, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.08372475951910019, "epoch": 0.4564, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.14405278694327e-06, "loss": 0.0, "num_tokens": 262752803.0, "reward": 1.23334801197052, "reward_std": 0.0, "rewards/reward_fn/mean": 1.23334801197052, "rewards/reward_fn/std": 1.4011080265045166, "step": 5705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08017800375819206, "epoch": 0.45648, "grad_norm": 0.0, "learning_rate": 2.1436016912779585e-06, "loss": 0.0, "step": 5706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.8359375, "completions/mean_terminated_length": 242.82022094726562, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.07951094210147858, "epoch": 0.45656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1431505680646857e-06, "loss": 0.0, "num_tokens": 262849934.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07741302251815796, "epoch": 0.45664, "grad_norm": 0.0, "learning_rate": 2.1426994173350254e-06, "loss": 0.0, "step": 5708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.921875, "completions/mean_terminated_length": 240.2244873046875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.06486102379858494, "epoch": 0.45672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1422482391205533e-06, "loss": 0.0, "num_tokens": 262946692.0, "reward": 0.08158833533525467, "reward_std": 0.0, "rewards/reward_fn/mean": 0.08158833533525467, "rewards/reward_fn/std": 0.21671062707901, "step": 5709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06854598224163055, "epoch": 0.4568, "grad_norm": 0.0, "learning_rate": 2.1417970334528476e-06, "loss": 0.0, "step": 5710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.765625, "completions/mean_terminated_length": 224.01980590820312, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.07256746292114258, "epoch": 0.45688, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.141345800363488e-06, "loss": 0.0, "num_tokens": 263041766.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07455478981137276, "epoch": 0.45696, "grad_norm": 0.0, "learning_rate": 2.140894539884056e-06, "loss": 0.0, "step": 5712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.8203125, "completions/mean_terminated_length": 209.99081420898438, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.07670372724533081, "epoch": 0.45704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1404432520461346e-06, "loss": 0.0, "num_tokens": 263135055.0, "reward": 0.3923865556716919, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3923865556716919, "rewards/reward_fn/std": 0.9905130863189697, "step": 5713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07616851106286049, "epoch": 0.45712, "grad_norm": 0.0, "learning_rate": 2.139991936881309e-06, "loss": 0.0, "step": 5714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.0625, "completions/mean_terminated_length": 207.70730590820312, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.07170596346259117, "epoch": 0.4572, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.139540594421167e-06, "loss": 0.0, "num_tokens": 263229399.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06852452456951141, "epoch": 0.45728, "grad_norm": 0.0, "learning_rate": 2.1390892246972977e-06, "loss": 0.0, "step": 5716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.9375, "completions/mean_terminated_length": 230.92681884765625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.06512053310871124, "epoch": 0.45736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1386378277412926e-06, "loss": 0.0, "num_tokens": 263325647.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06488565355539322, "epoch": 0.45744, "grad_norm": 0.0, "learning_rate": 2.138186403584744e-06, "loss": 0.0, "step": 5718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 218.90625, "completions/mean_terminated_length": 200.7906951904297, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.06435985304415226, "epoch": 0.45752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1377349522592476e-06, "loss": 0.0, "num_tokens": 263419203.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06787854060530663, "epoch": 0.4576, "grad_norm": 0.0, "learning_rate": 2.1372834737964e-06, "loss": 0.0, "step": 5720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 236.0703125, "completions/mean_terminated_length": 222.87013244628906, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.06811780482530594, "epoch": 0.45768, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.136831968227799e-06, "loss": 0.0, "num_tokens": 263514956.0, "reward": 0.009978720918297768, "reward_std": 0.0, "rewards/reward_fn/mean": 0.009978720918297768, "rewards/reward_fn/std": 0.02650495432317257, "step": 5721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06566578894853592, "epoch": 0.45776, "grad_norm": 0.0, "learning_rate": 2.136380435585046e-06, "loss": 0.0, "step": 5722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 249.78125, "completions/mean_terminated_length": 242.9508056640625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.06658412888646126, "epoch": 0.45784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.135928875899744e-06, "loss": 0.0, "num_tokens": 263612464.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06478744000196457, "epoch": 0.45792, "grad_norm": 0.0, "learning_rate": 2.1354772892034965e-06, "loss": 0.0, "step": 5724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.375, "completions/mean_terminated_length": 222.4225311279297, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.06890636682510376, "epoch": 0.458, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1350256755279103e-06, "loss": 0.0, "num_tokens": 263708384.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07057343795895576, "epoch": 0.45808, "grad_norm": 0.0, "learning_rate": 2.134574034904593e-06, "loss": 0.0, "step": 5726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.8671875, "completions/mean_terminated_length": 238.2878875732422, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.07233171910047531, "epoch": 0.45816, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.134122367365156e-06, "loss": 0.0, "num_tokens": 263805519.0, "reward": 0.2013826072216034, "reward_std": 0.0, "rewards/reward_fn/mean": 0.2013826072216034, "rewards/reward_fn/std": 0.3515227437019348, "step": 5727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07341454178094864, "epoch": 0.45824, "grad_norm": 0.0, "learning_rate": 2.13367067294121e-06, "loss": 0.0, "step": 5728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.53125, "completions/mean_terminated_length": 211.94595336914062, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.07148493826389313, "epoch": 0.45832, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1332189516643686e-06, "loss": 0.0, "num_tokens": 263900563.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 5729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07279231026768684, "epoch": 0.4584, "grad_norm": 0.0, "learning_rate": 2.1327672035662486e-06, "loss": 0.0, "step": 5730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 198.703125, "completions/mean_terminated_length": 174.5111083984375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.06073526665568352, "epoch": 0.45848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.132315428678467e-06, "loss": 0.0, "num_tokens": 263991533.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 5731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0627033170312643, "epoch": 0.45856, "grad_norm": 0.0, "learning_rate": 2.131863627032644e-06, "loss": 0.0, "step": 5732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.421875, "completions/mean_terminated_length": 220.4130401611328, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.06967969983816147, "epoch": 0.45864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1314117986603993e-06, "loss": 0.0, "num_tokens": 264086563.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06967379152774811, "epoch": 0.45872, "grad_norm": 0.0, "learning_rate": 2.130959943593358e-06, "loss": 0.0, "step": 5734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.2421875, "completions/mean_terminated_length": 209.2153778076172, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.07582451403141022, "epoch": 0.4588, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1305080618631445e-06, "loss": 0.0, "num_tokens": 264181826.0, "reward": 0.8315883278846741, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8315883278846741, "rewards/reward_fn/std": 1.2745213508605957, "step": 5735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07105086743831635, "epoch": 0.45888, "grad_norm": 0.0, "learning_rate": 2.1300561535013853e-06, "loss": 0.0, "step": 5736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.71875, "completions/mean_terminated_length": 241.4794464111328, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.07131941244006157, "epoch": 0.45896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1296042185397097e-06, "loss": 0.0, "num_tokens": 264279070.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07308651506900787, "epoch": 0.45904, "grad_norm": 0.0, "learning_rate": 2.1291522570097475e-06, "loss": 0.0, "step": 5738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.5078125, "completions/mean_terminated_length": 217.61538696289062, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.07457967475056648, "epoch": 0.45912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1287002689431324e-06, "loss": 0.0, "num_tokens": 264374879.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07258317992091179, "epoch": 0.4592, "grad_norm": 0.0, "learning_rate": 2.128248254371498e-06, "loss": 0.0, "step": 5740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.53125, "completions/mean_terminated_length": 221.65000915527344, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.08078157529234886, "epoch": 0.45928, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1277962133264816e-06, "loss": 0.0, "num_tokens": 264470435.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08010346069931984, "epoch": 0.45936, "grad_norm": 0.0, "learning_rate": 2.12734414583972e-06, "loss": 0.0, "step": 5742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 227.1953125, "completions/mean_terminated_length": 204.7916717529297, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.07519292086362839, "epoch": 0.45944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1268920519428535e-06, "loss": 0.0, "num_tokens": 264565052.0, "reward": 0.38249102234840393, "reward_std": 0.0, "rewards/reward_fn/mean": 0.38249102234840393, "rewards/reward_fn/std": 0.9934079051017761, "step": 5743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0764254778623581, "epoch": 0.45952, "grad_norm": 0.0, "learning_rate": 2.1264399316675244e-06, "loss": 0.0, "step": 5744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.9296875, "completions/mean_terminated_length": 234.5416717529297, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.07290796563029289, "epoch": 0.4596, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1259877850453757e-06, "loss": 0.0, "num_tokens": 264661811.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06932758912444115, "epoch": 0.45968, "grad_norm": 0.0, "learning_rate": 2.1255356121080533e-06, "loss": 0.0, "step": 5746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.0546875, "completions/mean_terminated_length": 238.76271057128906, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.07244593277573586, "epoch": 0.45976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.125083412887205e-06, "loss": 0.0, "num_tokens": 264759098.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06807876005768776, "epoch": 0.45984, "grad_norm": 0.0, "learning_rate": 2.124631187414479e-06, "loss": 0.0, "step": 5748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.2109375, "completions/mean_terminated_length": 227.98948669433594, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.06389430910348892, "epoch": 0.45992, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.124178935721527e-06, "loss": 0.0, "num_tokens": 264854741.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06462664529681206, "epoch": 0.46, "grad_norm": 0.0, "learning_rate": 2.1237266578400014e-06, "loss": 0.0, "step": 5750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 232.328125, "completions/mean_terminated_length": 217.64556884765625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.0696931704878807, "epoch": 0.46008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.123274353801557e-06, "loss": 0.0, "num_tokens": 264950015.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0660937950015068, "epoch": 0.46016, "grad_norm": 0.0, "learning_rate": 2.1228220236378503e-06, "loss": 0.0, "step": 5752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 213.5859375, "completions/mean_terminated_length": 192.87208557128906, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.0699629969894886, "epoch": 0.46024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.12236966738054e-06, "loss": 0.0, "num_tokens": 265042890.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07366563379764557, "epoch": 0.46032, "grad_norm": 0.0, "learning_rate": 2.121917285061286e-06, "loss": 0.0, "step": 5754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.6640625, "completions/mean_terminated_length": 238.6483612060547, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.07622702792286873, "epoch": 0.4604, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.12146487671175e-06, "loss": 0.0, "num_tokens": 265139615.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07465097308158875, "epoch": 0.46048, "grad_norm": 0.0, "learning_rate": 2.121012442363596e-06, "loss": 0.0, "step": 5756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6171875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.7265625, "completions/mean_terminated_length": 176.91836547851562, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.06422814354300499, "epoch": 0.46056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1205599820484892e-06, "loss": 0.0, "num_tokens": 265234044.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06120280735194683, "epoch": 0.46064, "grad_norm": 0.0, "learning_rate": 2.1201074957980978e-06, "loss": 0.0, "step": 5758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.3203125, "completions/mean_terminated_length": 226.36471557617188, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.06859026849269867, "epoch": 0.46072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1196549836440916e-06, "loss": 0.0, "num_tokens": 265329829.0, "reward": 0.047493621706962585, "reward_std": 0.0, "rewards/reward_fn/mean": 0.047493621706962585, "rewards/reward_fn/std": 0.12615004181861877, "step": 5759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06891359016299248, "epoch": 0.4608, "grad_norm": 0.0, "learning_rate": 2.11920244561814e-06, "loss": 0.0, "step": 5760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.3671875, "completions/mean_terminated_length": 220.9666748046875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.06539348512887955, "epoch": 0.46088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.118749881751917e-06, "loss": 0.0, "num_tokens": 265424980.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06910606473684311, "epoch": 0.46096, "grad_norm": 0.0, "learning_rate": 2.118297292077097e-06, "loss": 0.0, "step": 5762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 232.40625, "completions/mean_terminated_length": 225.1836700439453, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.07447060942649841, "epoch": 0.46104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.117844676625356e-06, "loss": 0.0, "num_tokens": 265520264.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07408700883388519, "epoch": 0.46112, "grad_norm": 0.0, "learning_rate": 2.117392035428373e-06, "loss": 0.0, "step": 5764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 214.96875, "completions/mean_terminated_length": 205.00970458984375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.07057095691561699, "epoch": 0.4612, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1169393685178286e-06, "loss": 0.0, "num_tokens": 265613316.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07143183425068855, "epoch": 0.46128, "grad_norm": 0.0, "learning_rate": 2.116486675925404e-06, "loss": 0.0, "step": 5766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.375, "completions/mean_terminated_length": 228.5, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.07140787690877914, "epoch": 0.46136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1160339576827827e-06, "loss": 0.0, "num_tokens": 265708980.0, "reward": 0.3874585032463074, "reward_std": 0.0, "rewards/reward_fn/mean": 0.3874585032463074, "rewards/reward_fn/std": 0.9918686747550964, "step": 5767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0720260813832283, "epoch": 0.46144, "grad_norm": 0.0, "learning_rate": 2.11558121382165e-06, "loss": 0.0, "step": 5768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 238.6015625, "completions/mean_terminated_length": 233.5050506591797, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.06478871777653694, "epoch": 0.46152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1151284443736937e-06, "loss": 0.0, "num_tokens": 265805057.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06698542460799217, "epoch": 0.4616, "grad_norm": 0.0, "learning_rate": 2.1146756493706022e-06, "loss": 0.0, "step": 5770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1171875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 220.6171875, "completions/mean_terminated_length": 215.92034912109375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.0641554519534111, "epoch": 0.46168, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.114222828844068e-06, "loss": 0.0, "num_tokens": 265898832.0, "reward": 0.12279090285301208, "reward_std": 0.0, "rewards/reward_fn/mean": 0.12279090285301208, "rewards/reward_fn/std": 0.32615068554878235, "step": 5771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06653378903865814, "epoch": 0.46176, "grad_norm": 0.0, "learning_rate": 2.1137699828257823e-06, "loss": 0.0, "step": 5772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 220.4140625, "completions/mean_terminated_length": 204.82022094726562, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.08167005330324173, "epoch": 0.46184, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1133171113474403e-06, "loss": 0.0, "num_tokens": 265992581.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08065024763345718, "epoch": 0.46192, "grad_norm": 0.0, "learning_rate": 2.1128642144407376e-06, "loss": 0.0, "step": 5774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.7890625, "completions/mean_terminated_length": 232.6794891357422, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.07723989337682724, "epoch": 0.462, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1124112921373724e-06, "loss": 0.0, "num_tokens": 266089066.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07463009655475616, "epoch": 0.46208, "grad_norm": 0.0, "learning_rate": 2.111958344469044e-06, "loss": 0.0, "step": 5776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.3515625, "completions/mean_terminated_length": 238.52565002441406, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.06950423866510391, "epoch": 0.46216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.111505371467455e-06, "loss": 0.0, "num_tokens": 266186007.0, "reward": 0.49005722999572754, "reward_std": 0.0, "rewards/reward_fn/mean": 0.49005722999572754, "rewards/reward_fn/std": 0.975894570350647, "step": 5777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06781964376568794, "epoch": 0.46224, "grad_norm": 0.0, "learning_rate": 2.1110523731643086e-06, "loss": 0.0, "step": 5778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.8046875, "completions/mean_terminated_length": 239.48101806640625, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.07517771050333977, "epoch": 0.46232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1105993495913093e-06, "loss": 0.0, "num_tokens": 266283006.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07535526156425476, "epoch": 0.4624, "grad_norm": 0.0, "learning_rate": 2.1101463007801637e-06, "loss": 0.0, "step": 5780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.2890625, "completions/mean_terminated_length": 219.23655700683594, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.06891797482967377, "epoch": 0.46248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1096932267625817e-06, "loss": 0.0, "num_tokens": 266377891.0, "reward": 0.02943696826696396, "reward_std": 0.0, "rewards/reward_fn/mean": 0.02943696826696396, "rewards/reward_fn/std": 0.07818891853094101, "step": 5781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07254563644528389, "epoch": 0.46256, "grad_norm": 0.0, "learning_rate": 2.109240127570272e-06, "loss": 0.0, "step": 5782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 212.4609375, "completions/mean_terminated_length": 194.07778930664062, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.06862572580575943, "epoch": 0.46264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1087870032349483e-06, "loss": 0.0, "num_tokens": 266470622.0, "reward": 0.4067869484424591, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4067869484424591, "rewards/reward_fn/std": 0.9875356554985046, "step": 5783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06968590244650841, "epoch": 0.46272, "grad_norm": 0.0, "learning_rate": 2.108333853788324e-06, "loss": 0.0, "step": 5784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.5625, "completions/mean_terminated_length": 201.88888549804688, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.06422654166817665, "epoch": 0.4628, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1078806792621144e-06, "loss": 0.0, "num_tokens": 266565030.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06639540567994118, "epoch": 0.46288, "grad_norm": 0.0, "learning_rate": 2.1074274796880366e-06, "loss": 0.0, "step": 5786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 245.0546875, "completions/mean_terminated_length": 235.39706420898438, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.0752338171005249, "epoch": 0.46296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1069742550978113e-06, "loss": 0.0, "num_tokens": 266661933.0, "reward": 0.009978720918297768, "reward_std": 0.0, "rewards/reward_fn/mean": 0.009978720918297768, "rewards/reward_fn/std": 0.02650495432317257, "step": 5787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07716270536184311, "epoch": 0.46304, "grad_norm": 0.0, "learning_rate": 2.106521005523158e-06, "loss": 0.0, "step": 5788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.859375, "completions/mean_terminated_length": 239.04348754882812, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.07143371179699898, "epoch": 0.46312, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1060677309958e-06, "loss": 0.0, "num_tokens": 266759067.0, "reward": 0.03178694099187851, "reward_std": 0.0, "rewards/reward_fn/mean": 0.03178694099187851, "rewards/reward_fn/std": 0.08443079143762589, "step": 5789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07180742546916008, "epoch": 0.4632, "grad_norm": 0.0, "learning_rate": 2.1056144315474617e-06, "loss": 0.0, "step": 5790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.7421875, "completions/mean_terminated_length": 240.89999389648438, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.06512028351426125, "epoch": 0.46328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.105161107209869e-06, "loss": 0.0, "num_tokens": 266856314.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06329659186303616, "epoch": 0.46336, "grad_norm": 0.0, "learning_rate": 2.1047077580147497e-06, "loss": 0.0, "step": 5792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.125, "completions/mean_terminated_length": 219.9595947265625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.07492367923259735, "epoch": 0.46344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1042543839938345e-06, "loss": 0.0, "num_tokens": 266951050.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07633568346500397, "epoch": 0.46352, "grad_norm": 0.0, "learning_rate": 2.1038009851788537e-06, "loss": 0.0, "step": 5794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1328125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 171.421875, "completions/mean_terminated_length": 158.46847534179688, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.06901831924915314, "epoch": 0.4636, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1033475616015407e-06, "loss": 0.0, "num_tokens": 267038528.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 5795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06847037374973297, "epoch": 0.46368, "grad_norm": 0.0, "learning_rate": 2.102894113293631e-06, "loss": 0.0, "step": 5796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 209.8203125, "completions/mean_terminated_length": 192.44085693359375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.06468783132731915, "epoch": 0.46376, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.10244064028686e-06, "loss": 0.0, "num_tokens": 267130921.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0653378963470459, "epoch": 0.46384, "grad_norm": 0.0, "learning_rate": 2.101987142612967e-06, "loss": 0.0, "step": 5798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.0390625, "completions/mean_terminated_length": 223.70330810546875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.0730169489979744, "epoch": 0.46392, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.101533620303691e-06, "loss": 0.0, "num_tokens": 267226286.0, "reward": 0.03641407564282417, "reward_std": 0.0, "rewards/reward_fn/mean": 0.03641407564282417, "rewards/reward_fn/std": 0.09672114253044128, "step": 5799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07564475014805794, "epoch": 0.464, "grad_norm": 0.0, "learning_rate": 2.101080073390775e-06, "loss": 0.0, "step": 5800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6171875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 245.8984375, "completions/mean_terminated_length": 229.61224365234375, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.06829744949936867, "epoch": 0.46408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.100626501905962e-06, "loss": 0.0, "num_tokens": 267323297.0, "reward": 0.45800459384918213, "reward_std": 0.0, "rewards/reward_fn/mean": 0.45800459384918213, "rewards/reward_fn/std": 0.9889340400695801, "step": 5801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07007109001278877, "epoch": 0.46416, "grad_norm": 0.0, "learning_rate": 2.100172905880998e-06, "loss": 0.0, "step": 5802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 224.234375, "completions/mean_terminated_length": 214.51019287109375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.07272176444530487, "epoch": 0.46424, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0997192853476278e-06, "loss": 0.0, "num_tokens": 267417535.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0738779678940773, "epoch": 0.46432, "grad_norm": 0.0, "learning_rate": 2.0992656403376018e-06, "loss": 0.0, "step": 5804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 219.65625, "completions/mean_terminated_length": 205.43478393554688, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.06698507070541382, "epoch": 0.4644, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.09881197088267e-06, "loss": 0.0, "num_tokens": 267511187.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06785311177372932, "epoch": 0.46448, "grad_norm": 0.0, "learning_rate": 2.098358277014584e-06, "loss": 0.0, "step": 5806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.0078125, "completions/mean_terminated_length": 193.5731658935547, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.06540467590093613, "epoch": 0.46456, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0979045587650987e-06, "loss": 0.0, "num_tokens": 267604372.0, "reward": 0.4347124993801117, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4347124993801117, "rewards/reward_fn/std": 0.9859739542007446, "step": 5807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06349189206957817, "epoch": 0.46464, "grad_norm": 0.0, "learning_rate": 2.0974508161659686e-06, "loss": 0.0, "step": 5808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 236.09375, "completions/mean_terminated_length": 222.90908813476562, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.06912878900766373, "epoch": 0.46472, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.096997049248951e-06, "loss": 0.0, "num_tokens": 267700128.0, "reward": 0.05776464566588402, "reward_std": 0.0, "rewards/reward_fn/mean": 0.05776464566588402, "rewards/reward_fn/std": 0.15343140065670013, "step": 5809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06878039613366127, "epoch": 0.4648, "grad_norm": 0.0, "learning_rate": 2.0965432580458048e-06, "loss": 0.0, "step": 5810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.5859375, "completions/mean_terminated_length": 212.2584228515625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.06839488074183464, "epoch": 0.46488, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.096089442588291e-06, "loss": 0.0, "num_tokens": 267794539.0, "reward": 0.0722954273223877, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0722954273223877, "rewards/reward_fn/std": 0.192027285695076, "step": 5811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06334120035171509, "epoch": 0.46496, "grad_norm": 0.0, "learning_rate": 2.095635602908171e-06, "loss": 0.0, "step": 5812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 213.8828125, "completions/mean_terminated_length": 202.08999633789062, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.06970587000250816, "epoch": 0.46504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0951817390372104e-06, "loss": 0.0, "num_tokens": 267887452.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06631570309400558, "epoch": 0.46512, "grad_norm": 0.0, "learning_rate": 2.094727851007173e-06, "loss": 0.0, "step": 5814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.15625, "completions/mean_terminated_length": 223.18280029296875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.07601646333932877, "epoch": 0.4652, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0942739388498274e-06, "loss": 0.0, "num_tokens": 267982704.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07753558829426765, "epoch": 0.46528, "grad_norm": 0.0, "learning_rate": 2.0938200025969423e-06, "loss": 0.0, "step": 5816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 205.421875, "completions/mean_terminated_length": 181.58621215820312, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.07637351006269455, "epoch": 0.46536, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0933660422802883e-06, "loss": 0.0, "num_tokens": 268074534.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 5817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07391863316297531, "epoch": 0.46544, "grad_norm": 0.0, "learning_rate": 2.0929120579316383e-06, "loss": 0.0, "step": 5818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 226.578125, "completions/mean_terminated_length": 211.69412231445312, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.0699627436697483, "epoch": 0.46552, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.092458049582766e-06, "loss": 0.0, "num_tokens": 268169072.0, "reward": 0.18926793336868286, "reward_std": 0.0, "rewards/reward_fn/mean": 0.18926793336868286, "rewards/reward_fn/std": 0.32917243242263794, "step": 5819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06613923981785774, "epoch": 0.4656, "grad_norm": 0.0, "learning_rate": 2.092004017265447e-06, "loss": 0.0, "step": 5820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 226.3359375, "completions/mean_terminated_length": 214.728271484375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.06591043993830681, "epoch": 0.46568, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0915499610114596e-06, "loss": 0.0, "num_tokens": 268263579.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06452620774507523, "epoch": 0.46576, "grad_norm": 0.0, "learning_rate": 2.091095880852582e-06, "loss": 0.0, "step": 5822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.4296875, "completions/mean_terminated_length": 241.4166717529297, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.06437109410762787, "epoch": 0.46584, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.090641776820595e-06, "loss": 0.0, "num_tokens": 268360658.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06391197443008423, "epoch": 0.46592, "grad_norm": 0.0, "learning_rate": 2.090187648947282e-06, "loss": 0.0, "step": 5824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.328125, "completions/mean_terminated_length": 226.8000030517578, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.07125847041606903, "epoch": 0.466, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0897334972644263e-06, "loss": 0.0, "num_tokens": 268456188.0, "reward": 0.872876763343811, "reward_std": 0.0, "rewards/reward_fn/mean": 0.872876763343811, "rewards/reward_fn/std": 1.2733986377716064, "step": 5825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07240188866853714, "epoch": 0.46608, "grad_norm": 0.0, "learning_rate": 2.0892793218038147e-06, "loss": 0.0, "step": 5826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 249.421875, "completions/mean_terminated_length": 239.1599884033203, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "entropy": 0.0651739314198494, "epoch": 0.46616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0888251225972333e-06, "loss": 0.0, "num_tokens": 268553650.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06557367369532585, "epoch": 0.46624, "grad_norm": 0.0, "learning_rate": 2.0883708996764725e-06, "loss": 0.0, "step": 5828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.3125, "completions/mean_terminated_length": 212.4615478515625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.07145486399531364, "epoch": 0.46632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.087916653073322e-06, "loss": 0.0, "num_tokens": 268649690.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07240170612931252, "epoch": 0.4664, "grad_norm": 0.0, "learning_rate": 2.087462382819575e-06, "loss": 0.0, "step": 5830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.1640625, "completions/mean_terminated_length": 220.35365295410156, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.06983688473701477, "epoch": 0.46648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.087008088947026e-06, "loss": 0.0, "num_tokens": 268745071.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06741859763860703, "epoch": 0.46656, "grad_norm": 0.0, "learning_rate": 2.08655377148747e-06, "loss": 0.0, "step": 5832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.7265625, "completions/mean_terminated_length": 243.08535766601562, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.07251253724098206, "epoch": 0.46664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0860994304727046e-06, "loss": 0.0, "num_tokens": 268842316.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07419117540121078, "epoch": 0.46672, "grad_norm": 0.0, "learning_rate": 2.085645065934529e-06, "loss": 0.0, "step": 5834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.5078125, "completions/mean_terminated_length": 227.5, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.0679919607937336, "epoch": 0.4668, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0851906779047438e-06, "loss": 0.0, "num_tokens": 268937485.0, "reward": 0.384978711605072, "reward_std": 0.0, "rewards/reward_fn/mean": 0.384978711605072, "rewards/reward_fn/std": 0.9926154613494873, "step": 5835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06915033608675003, "epoch": 0.46688, "grad_norm": 0.0, "learning_rate": 2.0847362664151514e-06, "loss": 0.0, "step": 5836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.546875, "completions/mean_terminated_length": 225.3972625732422, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.06684018298983574, "epoch": 0.46696, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0842818314975556e-06, "loss": 0.0, "num_tokens": 269033555.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0696161799132824, "epoch": 0.46704, "grad_norm": 0.0, "learning_rate": 2.0838273731837633e-06, "loss": 0.0, "step": 5838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.2421875, "completions/mean_terminated_length": 239.26864624023438, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.07241114974021912, "epoch": 0.46712, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0833728915055794e-06, "loss": 0.0, "num_tokens": 269130738.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07054222747683525, "epoch": 0.4672, "grad_norm": 0.0, "learning_rate": 2.082918386494815e-06, "loss": 0.0, "step": 5840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.25, "completions/mean_terminated_length": 238.51852416992188, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.06517892330884933, "epoch": 0.46728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.08246385818328e-06, "loss": 0.0, "num_tokens": 269227154.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0646137036383152, "epoch": 0.46736, "grad_norm": 0.0, "learning_rate": 2.082009306602786e-06, "loss": 0.0, "step": 5842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1640625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 207.640625, "completions/mean_terminated_length": 198.14952087402344, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.06881137564778328, "epoch": 0.46744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.081554731785147e-06, "loss": 0.0, "num_tokens": 269319268.0, "reward": 0.7886883616447449, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7886883616447449, "rewards/reward_fn/std": 1.2856351137161255, "step": 5843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06997937709093094, "epoch": 0.46752, "grad_norm": 0.0, "learning_rate": 2.081100133762179e-06, "loss": 0.0, "step": 5844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.015625, "completions/mean_terminated_length": 228.03125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.06135190650820732, "epoch": 0.4676, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.080645512565699e-06, "loss": 0.0, "num_tokens": 269415782.0, "reward": 0.625, "reward_std": 0.0, "rewards/reward_fn/mean": 0.625, "rewards/reward_fn/std": 1.1153898239135742, "step": 5845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.059447651728987694, "epoch": 0.46768, "grad_norm": 0.0, "learning_rate": 2.080190868227525e-06, "loss": 0.0, "step": 5846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.234375, "completions/mean_terminated_length": 213.15492248535156, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.06708276644349098, "epoch": 0.46776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.079736200779477e-06, "loss": 0.0, "num_tokens": 269511044.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06655248999595642, "epoch": 0.46784, "grad_norm": 0.0, "learning_rate": 2.079281510253379e-06, "loss": 0.0, "step": 5848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 218.8203125, "completions/mean_terminated_length": 206.42709350585938, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.07292263954877853, "epoch": 0.46792, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0788267966810523e-06, "loss": 0.0, "num_tokens": 269604589.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07264771312475204, "epoch": 0.468, "grad_norm": 0.0, "learning_rate": 2.0783720600943236e-06, "loss": 0.0, "step": 5850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 239.546875, "completions/mean_terminated_length": 232.0681915283203, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.0685361996293068, "epoch": 0.46808, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0779173005250182e-06, "loss": 0.0, "num_tokens": 269700787.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06975237280130386, "epoch": 0.46816, "grad_norm": 0.0, "learning_rate": 2.0774625180049655e-06, "loss": 0.0, "step": 5852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.375, "completions/mean_terminated_length": 200.86956787109375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.07187622040510178, "epoch": 0.46824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0770077125659953e-06, "loss": 0.0, "num_tokens": 269794019.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 5853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06788206472992897, "epoch": 0.46832, "grad_norm": 0.0, "learning_rate": 2.0765528842399385e-06, "loss": 0.0, "step": 5854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1328125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 224.578125, "completions/mean_terminated_length": 219.76577758789062, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.06798586249351501, "epoch": 0.4684, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0760980330586294e-06, "loss": 0.0, "num_tokens": 269888301.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06636271625757217, "epoch": 0.46848, "grad_norm": 0.0, "learning_rate": 2.0756431590539027e-06, "loss": 0.0, "step": 5856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.515625, "completions/mean_terminated_length": 228.1063690185547, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.07197530195116997, "epoch": 0.46856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0751882622575933e-06, "loss": 0.0, "num_tokens": 269983983.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07541139796376228, "epoch": 0.46864, "grad_norm": 0.0, "learning_rate": 2.074733342701541e-06, "loss": 0.0, "step": 5858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 224.8046875, "completions/mean_terminated_length": 215.6666717529297, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.06520727276802063, "epoch": 0.46872, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.074278400417584e-06, "loss": 0.0, "num_tokens": 270078294.0, "reward": 0.06162349507212639, "reward_std": 0.0, "rewards/reward_fn/mean": 0.06162349507212639, "rewards/reward_fn/std": 0.16368108987808228, "step": 5859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06404006853699684, "epoch": 0.4688, "grad_norm": 0.0, "learning_rate": 2.073823435437564e-06, "loss": 0.0, "step": 5860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.1484375, "completions/mean_terminated_length": 237.09194946289062, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.07539128512144089, "epoch": 0.46888, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.073368447793324e-06, "loss": 0.0, "num_tokens": 270174953.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07430354505777359, "epoch": 0.46896, "grad_norm": 0.0, "learning_rate": 2.0729134375167083e-06, "loss": 0.0, "step": 5862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.765625, "completions/mean_terminated_length": 238.60000610351562, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.0692509263753891, "epoch": 0.46904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0724584046395623e-06, "loss": 0.0, "num_tokens": 270271691.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06733416393399239, "epoch": 0.46912, "grad_norm": 0.0, "learning_rate": 2.072003349193734e-06, "loss": 0.0, "step": 5864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.796875, "completions/mean_terminated_length": 223.01695251464844, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.07741504162549973, "epoch": 0.4692, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0715482712110716e-06, "loss": 0.0, "num_tokens": 270368049.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07514471188187599, "epoch": 0.46928, "grad_norm": 0.0, "learning_rate": 2.0710931707234267e-06, "loss": 0.0, "step": 5866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.40625, "completions/mean_terminated_length": 223.86666870117188, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.06782912462949753, "epoch": 0.46936, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0706380477626517e-06, "loss": 0.0, "num_tokens": 270463461.0, "reward": 0.004997334908694029, "reward_std": 0.0, "rewards/reward_fn/mean": 0.004997334908694029, "rewards/reward_fn/std": 0.013273656368255615, "step": 5867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06924230605363846, "epoch": 0.46944, "grad_norm": 0.0, "learning_rate": 2.0701829023606e-06, "loss": 0.0, "step": 5868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 190.90625, "completions/mean_terminated_length": 183.54782104492188, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.06997519731521606, "epoch": 0.46952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.069727734549126e-06, "loss": 0.0, "num_tokens": 270553433.0, "reward": 1.5, "reward_std": 0.0, "rewards/reward_fn/mean": 1.5, "rewards/reward_fn/std": 1.5058939456939697, "step": 5869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07451723515987396, "epoch": 0.4696, "grad_norm": 0.0, "learning_rate": 2.069272544360088e-06, "loss": 0.0, "step": 5870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5390625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 246.40625, "completions/mean_terminated_length": 235.1864471435547, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.07333661988377571, "epoch": 0.46968, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.068817331825344e-06, "loss": 0.0, "num_tokens": 270650509.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06888672709465027, "epoch": 0.46976, "grad_norm": 0.0, "learning_rate": 2.068362096976754e-06, "loss": 0.0, "step": 5872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.5078125, "completions/mean_terminated_length": 235.5566864013672, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.07579582929611206, "epoch": 0.46984, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.06790683984618e-06, "loss": 0.0, "num_tokens": 270746830.0, "reward": 0.04961630329489708, "reward_std": 0.0, "rewards/reward_fn/mean": 0.04961630329489708, "rewards/reward_fn/std": 0.1317882090806961, "step": 5873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07474849000573158, "epoch": 0.46992, "grad_norm": 0.0, "learning_rate": 2.067451560465485e-06, "loss": 0.0, "step": 5874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.7421875, "completions/mean_terminated_length": 238.56666564941406, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.07317576929926872, "epoch": 0.47, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.066996258866534e-06, "loss": 0.0, "num_tokens": 270843565.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0705416388809681, "epoch": 0.47008, "grad_norm": 0.0, "learning_rate": 2.0665409350811927e-06, "loss": 0.0, "step": 5876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 248.125, "completions/mean_terminated_length": 240.72727966308594, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.07238442450761795, "epoch": 0.47016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.066085589141329e-06, "loss": 0.0, "num_tokens": 270940861.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07572202384471893, "epoch": 0.47024, "grad_norm": 0.0, "learning_rate": 2.065630221078812e-06, "loss": 0.0, "step": 5878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.8671875, "completions/mean_terminated_length": 226.85848999023438, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.06879361718893051, "epoch": 0.47032, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.065174830925514e-06, "loss": 0.0, "num_tokens": 271036076.0, "reward": 0.08158833533525467, "reward_std": 0.0, "rewards/reward_fn/mean": 0.08158833533525467, "rewards/reward_fn/std": 0.21671062707901, "step": 5879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07201656326651573, "epoch": 0.4704, "grad_norm": 0.0, "learning_rate": 2.0647194187133066e-06, "loss": 0.0, "step": 5880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.421875, "completions/mean_terminated_length": 222.3170623779297, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.07639692723751068, "epoch": 0.47048, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0642639844740645e-06, "loss": 0.0, "num_tokens": 271131618.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07857085019350052, "epoch": 0.47056, "grad_norm": 0.0, "learning_rate": 2.0638085282396625e-06, "loss": 0.0, "step": 5882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 218.828125, "completions/mean_terminated_length": 202.53932189941406, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.06722943857312202, "epoch": 0.47064, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.063353050041977e-06, "loss": 0.0, "num_tokens": 271225164.0, "reward": 0.1251097172498703, "reward_std": 0.0, "rewards/reward_fn/mean": 0.1251097172498703, "rewards/reward_fn/std": 0.24494943022727966, "step": 5883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0668744407594204, "epoch": 0.47072, "grad_norm": 0.0, "learning_rate": 2.0628975499128886e-06, "loss": 0.0, "step": 5884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.5, "completions/mean_terminated_length": 201.27536010742188, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.06916020065546036, "epoch": 0.4708, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0624420278842757e-06, "loss": 0.0, "num_tokens": 271319692.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 5885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06634362041950226, "epoch": 0.47088, "grad_norm": 0.0, "learning_rate": 2.061986483988022e-06, "loss": 0.0, "step": 5886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1640625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 203.640625, "completions/mean_terminated_length": 193.36448669433594, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.06732083112001419, "epoch": 0.47096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0615309182560087e-06, "loss": 0.0, "num_tokens": 271411294.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06611666083335876, "epoch": 0.47104, "grad_norm": 0.0, "learning_rate": 2.061075330720122e-06, "loss": 0.0, "step": 5888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.4609375, "completions/mean_terminated_length": 213.51087951660156, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.07930901646614075, "epoch": 0.47112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0606197214122476e-06, "loss": 0.0, "num_tokens": 271505689.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 5889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07925358787178993, "epoch": 0.4712, "grad_norm": 0.0, "learning_rate": 2.0601640903642735e-06, "loss": 0.0, "step": 5890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 213.0859375, "completions/mean_terminated_length": 200.51515197753906, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.07037360593676567, "epoch": 0.47128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0597084376080883e-06, "loss": 0.0, "num_tokens": 271598500.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0655549019575119, "epoch": 0.47136, "grad_norm": 0.0, "learning_rate": 2.059252763175585e-06, "loss": 0.0, "step": 5892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.4375, "completions/mean_terminated_length": 215.0370330810547, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.07574334740638733, "epoch": 0.47144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0587970670986537e-06, "loss": 0.0, "num_tokens": 271692380.0, "reward": 0.4759461283683777, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4759461283683777, "rewards/reward_fn/std": 0.9938373565673828, "step": 5893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07877781242132187, "epoch": 0.47152, "grad_norm": 0.0, "learning_rate": 2.058341349409189e-06, "loss": 0.0, "step": 5894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.078125, "completions/mean_terminated_length": 214.8000030517578, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.06907908618450165, "epoch": 0.4716, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0578856101390874e-06, "loss": 0.0, "num_tokens": 271788006.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06867606937885284, "epoch": 0.47168, "grad_norm": 0.0, "learning_rate": 2.0574298493202443e-06, "loss": 0.0, "step": 5896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.734375, "completions/mean_terminated_length": 220.4615478515625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.07148021459579468, "epoch": 0.47176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.056974066984559e-06, "loss": 0.0, "num_tokens": 271883076.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07045438140630722, "epoch": 0.47184, "grad_norm": 0.0, "learning_rate": 2.0565182631639314e-06, "loss": 0.0, "step": 5898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.3046875, "completions/mean_terminated_length": 235.61627197265625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.07566724717617035, "epoch": 0.47192, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0560624378902627e-06, "loss": 0.0, "num_tokens": 271979627.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07595798373222351, "epoch": 0.472, "grad_norm": 0.0, "learning_rate": 2.0556065911954555e-06, "loss": 0.0, "step": 5900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 223.3828125, "completions/mean_terminated_length": 220.0086212158203, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.061819206923246384, "epoch": 0.47208, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0551507231114152e-06, "loss": 0.0, "num_tokens": 272073756.0, "reward": 0.7817869186401367, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7817869186401367, "rewards/reward_fn/std": 1.2883555889129639, "step": 5901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06123238056898117, "epoch": 0.47216, "grad_norm": 0.0, "learning_rate": 2.0546948336700477e-06, "loss": 0.0, "step": 5902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.3203125, "completions/mean_terminated_length": 230.640625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.07335037365555763, "epoch": 0.47224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0542389229032595e-06, "loss": 0.0, "num_tokens": 272170437.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07470303401350975, "epoch": 0.47232, "grad_norm": 0.0, "learning_rate": 2.0537829908429605e-06, "loss": 0.0, "step": 5904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.796875, "completions/mean_terminated_length": 236.7954559326172, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.06892842426896095, "epoch": 0.4724, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0533270375210604e-06, "loss": 0.0, "num_tokens": 272267051.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06518708541989326, "epoch": 0.47248, "grad_norm": 0.0, "learning_rate": 2.0528710629694713e-06, "loss": 0.0, "step": 5906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.2578125, "completions/mean_terminated_length": 225.72000122070312, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.06477416679263115, "epoch": 0.47256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0524150672201068e-06, "loss": 0.0, "num_tokens": 272363084.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06494451314210892, "epoch": 0.47264, "grad_norm": 0.0, "learning_rate": 2.0519590503048818e-06, "loss": 0.0, "step": 5908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.34375, "completions/mean_terminated_length": 224.57894897460938, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.07128269597887993, "epoch": 0.47272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.051503012255712e-06, "loss": 0.0, "num_tokens": 272459000.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07292592152953148, "epoch": 0.4728, "grad_norm": 0.0, "learning_rate": 2.051046953104517e-06, "loss": 0.0, "step": 5910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.8203125, "completions/mean_terminated_length": 216.10714721679688, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.06570330634713173, "epoch": 0.47288, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.050590872883214e-06, "loss": 0.0, "num_tokens": 272553953.0, "reward": 0.08703220635652542, "reward_std": 0.0, "rewards/reward_fn/mean": 0.08703220635652542, "rewards/reward_fn/std": 0.23117035627365112, "step": 5911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06111699342727661, "epoch": 0.47296, "grad_norm": 0.0, "learning_rate": 2.050134771623725e-06, "loss": 0.0, "step": 5912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.171875, "completions/mean_terminated_length": 223.39999389648438, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.07455368340015411, "epoch": 0.47304, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0496786493579718e-06, "loss": 0.0, "num_tokens": 272649975.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07506502419710159, "epoch": 0.47312, "grad_norm": 0.0, "learning_rate": 2.0492225061178785e-06, "loss": 0.0, "step": 5914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 243.3125, "completions/mean_terminated_length": 236.43373107910156, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.06630772724747658, "epoch": 0.4732, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0487663419353704e-06, "loss": 0.0, "num_tokens": 272746655.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06014326959848404, "epoch": 0.47328, "grad_norm": 0.0, "learning_rate": 2.0483101568423738e-06, "loss": 0.0, "step": 5916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.953125, "completions/mean_terminated_length": 210.02174377441406, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.07202295586466789, "epoch": 0.47336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0478539508708167e-06, "loss": 0.0, "num_tokens": 272840729.0, "reward": 0.4722360074520111, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4722360074520111, "rewards/reward_fn/std": 0.9926378130912781, "step": 5917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07174569740891457, "epoch": 0.47344, "grad_norm": 0.0, "learning_rate": 2.047397724052629e-06, "loss": 0.0, "step": 5918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 215.4921875, "completions/mean_terminated_length": 200.84042358398438, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.07461128383874893, "epoch": 0.47352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.046941476419742e-06, "loss": 0.0, "num_tokens": 272933848.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07358986884355545, "epoch": 0.4736, "grad_norm": 0.0, "learning_rate": 2.0464852080040877e-06, "loss": 0.0, "step": 5920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5078125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.390625, "completions/mean_terminated_length": 208.03175354003906, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.07233140990138054, "epoch": 0.47368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.046028918837601e-06, "loss": 0.0, "num_tokens": 273029130.0, "reward": 0.37749966979026794, "reward_std": 0.0, "rewards/reward_fn/mean": 0.37749966979026794, "rewards/reward_fn/std": 0.9951284527778625, "step": 5921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0678025558590889, "epoch": 0.47376, "grad_norm": 0.0, "learning_rate": 2.045572608952216e-06, "loss": 0.0, "step": 5922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 234.171875, "completions/mean_terminated_length": 223.8850555419922, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.06799216568470001, "epoch": 0.47384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0451162783798702e-06, "loss": 0.0, "num_tokens": 273124640.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06957617029547691, "epoch": 0.47392, "grad_norm": 0.0, "learning_rate": 2.044659927152502e-06, "loss": 0.0, "step": 5924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.3203125, "completions/mean_terminated_length": 229.59210205078125, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.0712638571858406, "epoch": 0.474, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0442035553020512e-06, "loss": 0.0, "num_tokens": 273220937.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06851991638541222, "epoch": 0.47408, "grad_norm": 0.0, "learning_rate": 2.043747162860458e-06, "loss": 0.0, "step": 5926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 229.875, "completions/mean_terminated_length": 212.0, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.07325968518853188, "epoch": 0.47416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.043290749859667e-06, "loss": 0.0, "num_tokens": 273315897.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0721978023648262, "epoch": 0.47424, "grad_norm": 0.0, "learning_rate": 2.0428343163316216e-06, "loss": 0.0, "step": 5928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.9453125, "completions/mean_terminated_length": 232.41836547851562, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.07195140421390533, "epoch": 0.47432, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0423778623082667e-06, "loss": 0.0, "num_tokens": 273411890.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07485218346118927, "epoch": 0.4744, "grad_norm": 0.0, "learning_rate": 2.041921387821549e-06, "loss": 0.0, "step": 5930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.9453125, "completions/mean_terminated_length": 216.29762268066406, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.0883852168917656, "epoch": 0.47448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0414648929034174e-06, "loss": 0.0, "num_tokens": 273506859.0, "reward": 0.8315883278846741, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8315883278846741, "rewards/reward_fn/std": 1.2745213508605957, "step": 5931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0843927338719368, "epoch": 0.47456, "grad_norm": 0.0, "learning_rate": 2.041008377585822e-06, "loss": 0.0, "step": 5932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 225.265625, "completions/mean_terminated_length": 212.7692413330078, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.067537110298872, "epoch": 0.47464, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0405518419007138e-06, "loss": 0.0, "num_tokens": 273601229.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06484977155923843, "epoch": 0.47472, "grad_norm": 0.0, "learning_rate": 2.040095285880046e-06, "loss": 0.0, "step": 5934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 223.9140625, "completions/mean_terminated_length": 208.79310607910156, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.06725486367940903, "epoch": 0.4748, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.039638709555772e-06, "loss": 0.0, "num_tokens": 273695426.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06969350203871727, "epoch": 0.47488, "grad_norm": 0.0, "learning_rate": 2.039182112959847e-06, "loss": 0.0, "step": 5936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 218.609375, "completions/mean_terminated_length": 200.98851013183594, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.07200538367033005, "epoch": 0.47496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.038725496124228e-06, "loss": 0.0, "num_tokens": 273788944.0, "reward": 0.4020647704601288, "reward_std": 0.0, "rewards/reward_fn/mean": 0.4020647704601288, "rewards/reward_fn/std": 0.9883498549461365, "step": 5937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07362377271056175, "epoch": 0.47504, "grad_norm": 0.0, "learning_rate": 2.0382688590808745e-06, "loss": 0.0, "step": 5938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.5390625, "completions/mean_terminated_length": 237.43038940429688, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.06812471523880959, "epoch": 0.47512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0378122018617454e-06, "loss": 0.0, "num_tokens": 273885781.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06960412114858627, "epoch": 0.4752, "grad_norm": 0.0, "learning_rate": 2.0373555244988022e-06, "loss": 0.0, "step": 5940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 244.3203125, "completions/mean_terminated_length": 230.22413635253906, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.07265928760170937, "epoch": 0.47528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.036898827024007e-06, "loss": 0.0, "num_tokens": 273982590.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07047685980796814, "epoch": 0.47536, "grad_norm": 0.0, "learning_rate": 2.0364421094693247e-06, "loss": 0.0, "step": 5942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 209.8203125, "completions/mean_terminated_length": 192.44085693359375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.07345303893089294, "epoch": 0.47544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0359853718667197e-06, "loss": 0.0, "num_tokens": 274074983.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0764889344573021, "epoch": 0.47552, "grad_norm": 0.0, "learning_rate": 2.035528614248159e-06, "loss": 0.0, "step": 5944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 240.4921875, "completions/mean_terminated_length": 232.64706420898438, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.0705735832452774, "epoch": 0.4756, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.035071836645611e-06, "loss": 0.0, "num_tokens": 274171302.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0696425549685955, "epoch": 0.47568, "grad_norm": 0.0, "learning_rate": 2.0346150390910463e-06, "loss": 0.0, "step": 5946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 236.8515625, "completions/mean_terminated_length": 226.1097412109375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.06252817809581757, "epoch": 0.47576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0341582216164345e-06, "loss": 0.0, "num_tokens": 274267155.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06418973207473755, "epoch": 0.47584, "grad_norm": 0.0, "learning_rate": 2.0337013842537483e-06, "loss": 0.0, "step": 5948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.734375, "completions/mean_terminated_length": 221.10447692871094, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.06534526497125626, "epoch": 0.47592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.033244527034962e-06, "loss": 0.0, "num_tokens": 274363121.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06319045647978783, "epoch": 0.476, "grad_norm": 0.0, "learning_rate": 2.03278764999205e-06, "loss": 0.0, "step": 5950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.4921875, "completions/mean_terminated_length": 234.3874969482422, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.07025954127311707, "epoch": 0.47608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0323307531569896e-06, "loss": 0.0, "num_tokens": 274459696.0, "reward": 0.012458499521017075, "reward_std": 0.0, "rewards/reward_fn/mean": 0.012458499521017075, "rewards/reward_fn/std": 0.03309160843491554, "step": 5951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06868845596909523, "epoch": 0.47616, "grad_norm": 0.0, "learning_rate": 2.031873836561758e-06, "loss": 0.0, "step": 5952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 224.0078125, "completions/mean_terminated_length": 199.90411376953125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.08214307948946953, "epoch": 0.47624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0314169002383356e-06, "loss": 0.0, "num_tokens": 274553905.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07959119975566864, "epoch": 0.47632, "grad_norm": 0.0, "learning_rate": 2.030959944218702e-06, "loss": 0.0, "step": 5954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.4765625, "completions/mean_terminated_length": 231.2133331298828, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.07414264976978302, "epoch": 0.4764, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.03050296853484e-06, "loss": 0.0, "num_tokens": 274650350.0, "reward": 1.125, "reward_std": 0.0, "rewards/reward_fn/mean": 1.125, "rewards/reward_fn/std": 1.4580755233764648, "step": 5955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07770256698131561, "epoch": 0.47648, "grad_norm": 0.0, "learning_rate": 2.0300459732187334e-06, "loss": 0.0, "step": 5956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 214.9921875, "completions/mean_terminated_length": 191.19752502441406, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.061882730573415756, "epoch": 0.47656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0295889583023653e-06, "loss": 0.0, "num_tokens": 274743405.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06139262393116951, "epoch": 0.47664, "grad_norm": 0.0, "learning_rate": 2.0291319238177244e-06, "loss": 0.0, "step": 5958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.578125, "completions/mean_terminated_length": 232.90110778808594, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.06837749481201172, "epoch": 0.47672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0286748697967962e-06, "loss": 0.0, "num_tokens": 274839607.0, "reward": 0.8188909888267517, "reward_std": 0.0, "rewards/reward_fn/mean": 0.8188909888267517, "rewards/reward_fn/std": 1.276761770248413, "step": 5959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06777458265423775, "epoch": 0.4768, "grad_norm": 0.0, "learning_rate": 2.0282177962715707e-06, "loss": 0.0, "step": 5960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.46875, "completions/mean_terminated_length": 238.8372039794922, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.0690990462899208, "epoch": 0.47688, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.027760703274038e-06, "loss": 0.0, "num_tokens": 274936435.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06941535323858261, "epoch": 0.47696, "grad_norm": 0.0, "learning_rate": 2.0273035908361902e-06, "loss": 0.0, "step": 5962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.03125, "completions/mean_terminated_length": 215.13401794433594, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.06895426288247108, "epoch": 0.47704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0268464589900193e-06, "loss": 0.0, "num_tokens": 275030775.0, "reward": 0.38992840051651, "reward_std": 0.0, "rewards/reward_fn/mean": 0.38992840051651, "rewards/reward_fn/std": 0.9911679029464722, "step": 5963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06993519514799118, "epoch": 0.47712, "grad_norm": 0.0, "learning_rate": 2.0263893077675207e-06, "loss": 0.0, "step": 5964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.1953125, "completions/mean_terminated_length": 228.42308044433594, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.07532628253102303, "epoch": 0.4772, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0259321372006896e-06, "loss": 0.0, "num_tokens": 275126928.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07445838302373886, "epoch": 0.47728, "grad_norm": 0.0, "learning_rate": 2.0254749473215233e-06, "loss": 0.0, "step": 5966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 218.7890625, "completions/mean_terminated_length": 201.875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.07058752700686455, "epoch": 0.47736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.02501773816202e-06, "loss": 0.0, "num_tokens": 275220469.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07045991346240044, "epoch": 0.47744, "grad_norm": 0.0, "learning_rate": 2.02456050975418e-06, "loss": 0.0, "step": 5968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.0234375, "completions/mean_terminated_length": 218.5731658935547, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.0655268244445324, "epoch": 0.47752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0241032621300047e-06, "loss": 0.0, "num_tokens": 275315704.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06538336351513863, "epoch": 0.4776, "grad_norm": 0.0, "learning_rate": 2.023645995321496e-06, "loss": 0.0, "step": 5970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.6796875, "completions/mean_terminated_length": 203.34568786621094, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.07639222964644432, "epoch": 0.47768, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0231887093606577e-06, "loss": 0.0, "num_tokens": 275409743.0, "reward": 0.04533843323588371, "reward_std": 0.0, "rewards/reward_fn/mean": 0.04533843323588371, "rewards/reward_fn/std": 0.12042555958032608, "step": 5971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07851898670196533, "epoch": 0.47776, "grad_norm": 0.0, "learning_rate": 2.0227314042794947e-06, "loss": 0.0, "step": 5972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4921875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 248.6015625, "completions/mean_terminated_length": 241.43077087402344, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.07129128649830818, "epoch": 0.47784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.022274080110014e-06, "loss": 0.0, "num_tokens": 275507100.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06991109997034073, "epoch": 0.47792, "grad_norm": 0.0, "learning_rate": 2.021816736884224e-06, "loss": 0.0, "step": 5974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 227.0546875, "completions/mean_terminated_length": 216.58509826660156, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.06527203321456909, "epoch": 0.478, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0213593746341334e-06, "loss": 0.0, "num_tokens": 275601699.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06667137518525124, "epoch": 0.47808, "grad_norm": 0.0, "learning_rate": 2.0209019933917527e-06, "loss": 0.0, "step": 5976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 208.3359375, "completions/mean_terminated_length": 191.09573364257812, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.068602304905653, "epoch": 0.47816, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0204445931890934e-06, "loss": 0.0, "num_tokens": 275693902.0, "reward": 0.7945029735565186, "reward_std": 0.0, "rewards/reward_fn/mean": 0.7945029735565186, "rewards/reward_fn/std": 1.280464768409729, "step": 5977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07300743088126183, "epoch": 0.47824, "grad_norm": 0.0, "learning_rate": 2.0199871740581687e-06, "loss": 0.0, "step": 5978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 239.046875, "completions/mean_terminated_length": 225.0, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.07079877331852913, "epoch": 0.47832, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.019529736030994e-06, "loss": 0.0, "num_tokens": 275790036.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06912478059530258, "epoch": 0.4784, "grad_norm": 0.0, "learning_rate": 2.019072279139584e-06, "loss": 0.0, "step": 5980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.5859375, "completions/mean_terminated_length": 239.68182373046875, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 0.06662982329726219, "epoch": 0.47848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0186148034159568e-06, "loss": 0.0, "num_tokens": 275887263.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06791336089372635, "epoch": 0.47856, "grad_norm": 0.0, "learning_rate": 2.0181573088921304e-06, "loss": 0.0, "step": 5982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 247.453125, "completions/mean_terminated_length": 241.21621704101562, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 0.07077911868691444, "epoch": 0.47864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.017699795600125e-06, "loss": 0.0, "num_tokens": 275984473.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07131555303931236, "epoch": 0.47872, "grad_norm": 0.0, "learning_rate": 2.0172422635719607e-06, "loss": 0.0, "step": 5984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 221.140625, "completions/mean_terminated_length": 206.42222595214844, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.06727280840277672, "epoch": 0.4788, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0167847128396607e-06, "loss": 0.0, "num_tokens": 276078315.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07138136774301529, "epoch": 0.47888, "grad_norm": 0.0, "learning_rate": 2.0163271434352487e-06, "loss": 0.0, "step": 5986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.390625, "completions/mean_terminated_length": 236.01998901367188, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.06517364829778671, "epoch": 0.47896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0158695553907497e-06, "loss": 0.0, "num_tokens": 276174621.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06470508128404617, "epoch": 0.47904, "grad_norm": 0.0, "learning_rate": 2.01541194873819e-06, "loss": 0.0, "step": 5988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 242.5234375, "completions/mean_terminated_length": 237.64892578125, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.06376069784164429, "epoch": 0.47912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0149543235095972e-06, "loss": 0.0, "num_tokens": 276271200.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06515286862850189, "epoch": 0.4792, "grad_norm": 0.0, "learning_rate": 2.014496679737e-06, "loss": 0.0, "step": 5990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 236.796875, "completions/mean_terminated_length": 224.0779266357422, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.06679666414856911, "epoch": 0.47928, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0140390174524288e-06, "loss": 0.0, "num_tokens": 276367046.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_fn/mean": 0.0, "rewards/reward_fn/std": 0.0, "step": 5991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06440019235014915, "epoch": 0.47936, "grad_norm": 0.0, "learning_rate": 2.013581336687915e-06, "loss": 0.0, "step": 5992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 198.8984375, "completions/mean_terminated_length": 180.64947509765625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.07132861763238907, "epoch": 0.47944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0131236374754916e-06, "loss": 0.0, "num_tokens": 276458041.0, "reward": 0.40443697571754456, "reward_std": 0.0, "rewards/reward_fn/mean": 0.40443697571754456, "rewards/reward_fn/std": 0.9879210591316223, "step": 5993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08093470335006714, "epoch": 0.47952, "grad_norm": 0.0, "learning_rate": 2.0126659198471934e-06, "loss": 0.0, "step": 5994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 233.7890625, "completions/mean_terminated_length": 214.19117736816406, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.06683529540896416, "epoch": 0.4796, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0122081838350543e-06, "loss": 0.0, "num_tokens": 276553502.0, "reward": 0.75, "reward_std": 0.0, "rewards/reward_fn/mean": 0.75, "rewards/reward_fn/std": 1.3041423559188843, "step": 5995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06620275229215622, "epoch": 0.47968, "grad_norm": 0.0, "learning_rate": 2.0117504294711125e-06, "loss": 0.0, "step": 5996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 211.6484375, "completions/mean_terminated_length": 194.29348754882812, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.06985344365239143, "epoch": 0.47976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.0112926567874045e-06, "loss": 0.0, "num_tokens": 276646129.0, "reward": 0.375, "reward_std": 0.0, "rewards/reward_fn/mean": 0.375, "rewards/reward_fn/std": 0.9960551857948303, "step": 5997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06989974901080132, "epoch": 0.47984, "grad_norm": 0.0, "learning_rate": 2.0108348658159706e-06, "loss": 0.0, "step": 5998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 224.53125, "completions/mean_terminated_length": 209.1627960205078, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.07054048776626587, "epoch": 0.47992, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.010377056588851e-06, "loss": 0.0, "num_tokens": 276740405.0, "reward": 0.12318844348192215, "reward_std": 0.0, "rewards/reward_fn/mean": 0.12318844348192215, "rewards/reward_fn/std": 0.32720664143562317, "step": 5999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06769870966672897, "epoch": 0.48, "grad_norm": 0.0, "learning_rate": 2.0099192291380878e-06, "loss": 0.0, "step": 6000 } ], "logging_steps": 1, "max_steps": 12500, "num_input_tokens_seen": 276740405, "num_train_epochs": 1, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }